Initial commit for deploying the project
Browse files- .gitattributes +1 -0
- src/.gitignore +18 -0
- src/Dockerfile +32 -0
- src/README.md +2 -0
- src/__init__.py +0 -0
- src/agents_rag/__init__.py +0 -0
- src/agents_rag/agents.py +78 -0
- src/agents_rag/crew.py +52 -0
- src/agents_rag/tasks.py +60 -0
- src/agents_rag/tools.py +14 -0
- src/app.py +476 -0
- src/assistants/__init__.py +0 -0
- src/assistants/assistant_v1.py +266 -0
- src/assistants/gemini_assistant.py +481 -0
- src/chroma_rag/chroma.sqlite3 +3 -0
- src/chroma_rag/e8532e8c-980b-42a2-acd8-df1effcd172a/data_level0.bin +3 -0
- src/chroma_rag/e8532e8c-980b-42a2-acd8-df1effcd172a/header.bin +3 -0
- src/chroma_rag/e8532e8c-980b-42a2-acd8-df1effcd172a/length.bin +3 -0
- src/chroma_rag/e8532e8c-980b-42a2-acd8-df1effcd172a/link_lists.bin +0 -0
- src/notebooks/read_pdf.py +15 -0
- src/notebooks/temp.py +58 -0
- src/response_api.py +292 -0
- src/run_app.py +45 -0
- src/utils/__init__.py +0 -0
- src/utils/download.py +126 -0
- src/utils/download_video.py +170 -0
- src/utils/json_file_record.json +14 -0
- src/utils/knowledge_base.py +523 -0
- src/utils/vectorDB.py +291 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
src/chroma_rag/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
src/.gitignore
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ignore all MP3 files
|
| 2 |
+
*.mp3
|
| 3 |
+
|
| 4 |
+
# Ignore all PDF files
|
| 5 |
+
*.pdf
|
| 6 |
+
|
| 7 |
+
# Ignore all MP4 files
|
| 8 |
+
*.mp4
|
| 9 |
+
|
| 10 |
+
# Ignore all TXT files
|
| 11 |
+
*.txt
|
| 12 |
+
|
| 13 |
+
# Ignore environment files
|
| 14 |
+
.env
|
| 15 |
+
|
| 16 |
+
# Ignore Python cache files
|
| 17 |
+
__pycache__/
|
| 18 |
+
*.pyc
|
src/Dockerfile
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use Python 3.10 as base image
FROM python:3.10-slim

# Set working directory in container
WORKDIR /app/src

# Copy requirements first so the dependency layer is cached between builds
COPY requirements.txt .

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire project
COPY . .

# Expose ports for API (8000) and Streamlit (8501)
EXPOSE 8000 8501

# Create the script that runs both services, and make it executable.
# printf is used instead of echo so the \n escapes are portable across shells.
RUN printf '#!/bin/bash\npython response_api.py &\nsleep 5\nstreamlit run app.py\n' > ./start.sh \
    && chmod +x ./start.sh

# NOTE(fix): the original `ENV $(cat .env | xargs)` is invalid — the ENV
# instruction does not execute shell commands, so no variables were ever
# set by it.  The application loads .env itself at runtime via
# python-dotenv (the file is included by `COPY . .`), so no ENV
# instruction is needed here.

# Run the start script
CMD ["./start.sh"]
|
src/README.md
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Gemini_RAG
|
| 2 |
+
Create multi-functionality LLM application which support RAG functionality with multiple agents.
|
src/__init__.py
ADDED
|
File without changes
|
src/agents_rag/__init__.py
ADDED
|
File without changes
|
src/agents_rag/agents.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from crewai import Agent
from dotenv import load_dotenv

from agents_rag.tools import tool

# Load .env before reading GEMINI_API_KEY below.
load_dotenv()
import os

from langchain_google_genai import ChatGoogleGenerativeAI

## call the gemini models
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-exp",
    verbose=True,
    temperature=0.5,
    google_api_key=os.getenv("GEMINI_API_KEY"),
)

# Creating a senior researcher agent with memory and verbose mode

news_researcher = Agent(
    role="Senior Researcher",
    # NOTE(fix): corrected typo "Unccover" -> "Uncover" in the goal prompt.
    goal="Uncover ground breaking technologies in {topic}",
    verbose=True,
    memory=True,
    # NOTE(fix): added trailing spaces — the original implicit string
    # concatenation ran words together ("forefront ofinnovation").
    backstory=(
        "Driven by curiosity, you're at the forefront of "
        "innovation, eager to explore and share knowledge that could change "
        "the world."
    ),
    tools=[tool],
    llm=llm,
    allow_delegation=True,
)

## creating a write agent with custom tools responsible in writing news blog

news_writer = Agent(
    role="Writer",
    goal="Narrate compelling tech stories about {topic}",
    verbose=True,
    memory=True,
    # NOTE(fix): added trailing spaces — the original implicit string
    # concatenation ran words together ("craftengaging").
    backstory=(
        "With a flair for simplifying complex topics, you craft "
        "engaging narratives that captivate and educate, bringing new "
        "discoveries to light in an accessible manner."
    ),
    tools=[tool],
    llm=llm,
    allow_delegation=False,
)

# Answers queries by searching the web and synthesising the results.
writer_rag = Agent(
    role="Query Answerer using Web Search",
    goal="Create comprehensive and well-structured answers to given queries based on web search content.",
    backstory=(
        "Specialized in synthesizing information from multiple sources into "
        "coherent and engaging content while maintaining accuracy."
    ),
    tools=[tool],  # Ensure 'tool' is suitable for web search tasks
    llm=llm,  # Ensure 'llm' is configured for this agent's needs
    verbose=True,
    memory=True,
    allow_delegation=False,
)

# Answers strictly from the supplied context (no web search) with citations.
rag_agent = Agent(
    # NOTE(fix): rewrote the garbled role/goal prompts into clear English,
    # keeping their original intent: context-only answers, citations,
    # decline when no relevant context is found, respond in the requested
    # format.
    role=(
        "RAG agent that answers the given query quickly with citations, "
        "using only the provided context. Do not search elsewhere; if no "
        "relevant context is found, decline to answer. Always answer in "
        "the given format."
    ),
    goal=(
        "Create comprehensive and well-structured answers to given queries "
        "with citations, based on the given context only, in the given format."
    ),
    backstory=(
        "Specialized in synthesizing information from multiple sources into "
        "coherent and engaging content while maintaining accuracy."
    ),
    llm=llm,  # Ensure 'llm' is configured for this agent's needs
    verbose=True,
    memory=True,
    allow_delegation=False,
)
|
src/agents_rag/crew.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from crewai import Crew, Process

from agents_rag.agents import rag_agent, writer_rag
from agents_rag.tasks import rag_task, write_rag_task


def _strip_code_fences(text):
    """Remove Markdown code fences (including a leading ```json tag) from *text*."""
    text = text.removeprefix("```json")
    text = text.removesuffix("```")
    return text.replace("```", "")


def get_crew_response(query=None, context=None, url=None):
    """Run the appropriate crew for *query* and return its cleaned output.

    When *url* is anything other than the literal string "None", a
    web-search crew answers from that page; otherwise a RAG crew answers
    from the supplied *context*.  The crew output is returned as a string
    with Markdown code fences stripped.

    NOTE(review): callers signal "no URL" with the *string* "None", not
    the None object — preserved here for backward compatibility.
    """
    if url != "None":
        ## Forming the tech focused crew with some enhanced configuration
        crew_search = Crew(
            agents=[writer_rag],
            tasks=[write_rag_task],
            process=Process.sequential,
        )

        ## starting the task execution process with enhanced feedback
        result_web = crew_search.kickoff(
            inputs={
                "query": f"{query}",
                "url": f"{url}",
            }
        )
        # str() guards against kickoff returning a CrewOutput object
        # rather than a plain string (newer crewai versions).
        return _strip_code_fences(str(result_web))

    ## Forming the tech focused crew with some enhanced configuration
    crew_rag = Crew(
        agents=[rag_agent],
        tasks=[rag_task],
        process=Process.sequential,
    )

    ## starting the task execution process with enhanced feedback
    result_rag = crew_rag.kickoff(
        inputs={
            "query": f"{query}",
            "context": f"{context}",
        }
    )
    return _strip_code_fences(str(result_rag))
|
src/agents_rag/tasks.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from crewai import Task

from agents_rag.agents import news_researcher, news_writer, rag_agent, writer_rag
from agents_rag.tools import tool

# Research task
# NOTE(fix): added trailing spaces throughout — the original implicit
# string concatenation ran sentences together ("{topic}.Focus on ...").
research_task = Task(
    description=(
        "Identify the next big trend in {topic}. "
        "Focus on identifying pros and cons and the overall narrative. "
        "Your final report should clearly articulate the key points, "
        "its market opportunities, and potential risks."
    ),
    expected_output="A comprehensive 3 paragraphs long report on the latest AI trends.",
    tools=[tool],
    agent=news_researcher,
)

# Writing task with language model configuration
write_task = Task(
    description=(
        "Compose an insightful article on {topic}. "
        "Focus on the latest trends and how it's impacting the industry. "
        "This article should be easy to understand, engaging, and positive."
    ),
    expected_output="A 4 paragraph article on {topic} advancements formatted as markdown.",
    tools=[tool],
    agent=news_writer,
    async_execution=False,
    # output_file="new-blog-post.md",  # Example of output customization
)

# Web-search answering task: answer {query}, restricted to {url} when one
# is supplied, otherwise via free web search; output a single JSON object.
# NOTE(fix): cleaned up prompt typos/grammar ("Give query", "manention",
# "qurey", "generate generate", "two field" vs three fields) without
# changing the prompt's intent.
write_rag_task = Task(
    description=(
        "Answer the given query: {query} like a RAG application, from the given url: {url} only. "
        "If the user mentions a url then use only that url to answer the query; otherwise you can do a web-search to answer the query. "
        "Focus on the latest trends. "
        "The answer should be easy to understand, engaging, and positive. "
        "Also show the source of the content that was used to generate the answer. "
        "All sources are also shown as citations; each citation has a number, and at the end there is a map of citation numbers to sources."
    ),
    expected_output=(
        "Give a normal-length answer; if the user asked for something specific like 'small', 'big' or similar, "
        "then answer according to that length, on {query}, formatted as only one JSON object with three fields: "
        "'Answer' containing the generated answer, 'context' containing the context used to generate the answer, "
        "and 'citations'. If required, use a table format."
    ),
    agent=writer_rag,
    tools=[tool],  # Ensure 'tool' is appropriate for the task
    async_execution=False,
    output_file="new-blog-post.md",  # Specify the desired output file path
)

# Context-only RAG task: answer {query} strictly from {context}.
rag_task = Task(
    description=(
        "Answer the given query: {query} like a RAG application, using the given context: {context}. "
        "Focus on the latest trends. "
        "The answer should be easy to understand, engaging, and positive. "
        "Also show the source of the content that was used to generate the answer. "
        "All sources are also shown as citations; each citation has a number, and at the end there is a map of citation numbers to sources."
    ),
    expected_output=(
        "Give a normal-length answer; if the user asked for something specific like 'small', 'big' or similar, "
        "then answer according to that length, on {query}, formatted as JSON only, with three fields: "
        "'Answer' containing the generated answer, 'context' containing the context used to generate the answer, "
        "and 'citations'. If required, use a table format."
    ),
    agent=rag_agent,
    async_execution=False,
    # output_file="new-blog-post.md",  # Specify the desired output file path
)
|
src/agents_rag/tools.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## https://serper.dev/

from dotenv import load_dotenv

# Load .env before reading SERPER_API_KEY below.
load_dotenv()
import os

# Propagate the Serper key for crewai_tools.
# NOTE(fix): `os.environ[...] = os.getenv(...)` raises TypeError when the
# variable is missing (os.environ values must be str, not None), so only
# set it when the key is actually present in the environment / .env file.
_serper_key = os.getenv("SERPER_API_KEY")
if _serper_key is not None:
    os.environ["SERPER_API_KEY"] = _serper_key


from crewai_tools import SerperDevTool

# Initialize the tool for internet searching capabilities
tool = SerperDevTool()
|
src/app.py
ADDED
|
@@ -0,0 +1,476 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import json
|
| 2 |
+
# import os
|
| 3 |
+
# import re
|
| 4 |
+
# import time
|
| 5 |
+
# from pathlib import Path
|
| 6 |
+
# from typing import Dict, List
|
| 7 |
+
|
| 8 |
+
# import requests
|
| 9 |
+
# import streamlit as st
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# def extract_and_verify_url(string):
|
| 13 |
+
# """Extract the URL from the string and verify if it points to content."""
|
| 14 |
+
# url_pattern = re.compile(
|
| 15 |
+
# r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
|
| 16 |
+
# )
|
| 17 |
+
# match = url_pattern.search(string)
|
| 18 |
+
# if match:
|
| 19 |
+
# url = match.group()
|
| 20 |
+
# try:
|
| 21 |
+
# response = requests.head(url, allow_redirects=True, timeout=5)
|
| 22 |
+
# if response.status_code == 200:
|
| 23 |
+
# return {
|
| 24 |
+
# "url": url,
|
| 25 |
+
# "status": "Valid and content exists",
|
| 26 |
+
# "status_code": 200,
|
| 27 |
+
# }
|
| 28 |
+
# else:
|
| 29 |
+
# return {
|
| 30 |
+
# "url": url,
|
| 31 |
+
# "status": f"Invalid (HTTP {response.status_code})",
|
| 32 |
+
# "status_code": response.status_code,
|
| 33 |
+
# }
|
| 34 |
+
# except requests.RequestException as e:
|
| 35 |
+
# return {"url": url, "status": f"Error: {e}"}
|
| 36 |
+
# return {"url": "", "status": "No URL found"}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# def stream_data(data_val):
|
| 40 |
+
# for word in data_val.split(" "):
|
| 41 |
+
# yield word + " "
|
| 42 |
+
# time.sleep(0.02)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# def list_files_in_directory(directory):
|
| 46 |
+
# """List all files in the given directory."""
|
| 47 |
+
# try:
|
| 48 |
+
# return os.listdir(directory)
|
| 49 |
+
# except FileNotFoundError:
|
| 50 |
+
# return []
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# def save_uploaded_file(uploaded_file, directory):
|
| 54 |
+
# """Save the uploaded file to the specified directory."""
|
| 55 |
+
# with open(os.path.join(directory, uploaded_file.name), "wb") as f:
|
| 56 |
+
# f.write(uploaded_file.getbuffer())
|
| 57 |
+
|
| 58 |
+
# return os.path.join(directory, uploaded_file.name)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# def call_rag_api(
|
| 62 |
+
# query: str,
|
| 63 |
+
# url: str,
|
| 64 |
+
# is_uploaded: bool = False,
|
| 65 |
+
# ) -> Dict:
|
| 66 |
+
# """Call the RAG API and get a response."""
|
| 67 |
+
# endpoint = f"http://127.0.0.1:8000/get-response"
|
| 68 |
+
# payload = {"query": query, "is_uploaded": is_uploaded, "url": url}
|
| 69 |
+
|
| 70 |
+
# try:
|
| 71 |
+
# response = requests.post(endpoint, json=payload)
|
| 72 |
+
# response.raise_for_status()
|
| 73 |
+
# result = response.json()
|
| 74 |
+
# print(type(result))
|
| 75 |
+
# print(result)
|
| 76 |
+
# return {
|
| 77 |
+
# "status": "success",
|
| 78 |
+
# "response": result["response"],
|
| 79 |
+
# "context": result["context"],
|
| 80 |
+
# "citations": result["citations"],
|
| 81 |
+
# }
|
| 82 |
+
# except requests.exceptions.RequestException as e:
|
| 83 |
+
# return {"status": "error", "message": str(e)}
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# # Main Streamlit app
|
| 87 |
+
# def main():
|
| 88 |
+
# st.title("🤖 RAG Chat Assistant")
|
| 89 |
+
|
| 90 |
+
# # Sidebar inputs and actions
|
| 91 |
+
# with st.sidebar:
|
| 92 |
+
# st.header("📚 Document Control")
|
| 93 |
+
|
| 94 |
+
# # Input directory
|
| 95 |
+
# directory = st.text_input("Enter the directory path:", value="data")
|
| 96 |
+
|
| 97 |
+
# # Ensure directory exists
|
| 98 |
+
# Path(directory).mkdir(parents=True, exist_ok=True)
|
| 99 |
+
|
| 100 |
+
# # Display files in the directory
|
| 101 |
+
# st.subheader("Files in Directory")
|
| 102 |
+
# files = list_files_in_directory(directory)
|
| 103 |
+
# if files:
|
| 104 |
+
# st.write(files)
|
| 105 |
+
# else:
|
| 106 |
+
# st.write("No files found.")
|
| 107 |
+
|
| 108 |
+
# # Upload file
|
| 109 |
+
# st.subheader("Upload a File")
|
| 110 |
+
# uploaded_file = st.file_uploader(
|
| 111 |
+
# "Choose a file", type=["txt", "pdf", "doc", "docx", "mp3", "mp4"]
|
| 112 |
+
# )
|
| 113 |
+
# if uploaded_file:
|
| 114 |
+
# file_path = save_uploaded_file(uploaded_file, directory)
|
| 115 |
+
# with st.spinner:
|
| 116 |
+
# endpoint = f"http://127.0.0.1:8000/process-file"
|
| 117 |
+
# payload = {"file_path": file_path}
|
| 118 |
+
# result = requests.post(endpoint, json=payload)
|
| 119 |
+
# st.success(result["response"])
|
| 120 |
+
# st.success(f"File '{uploaded_file.name}' uploaded successfully!")
|
| 121 |
+
|
| 122 |
+
# # Delete a file
|
| 123 |
+
# st.subheader("Delete a File")
|
| 124 |
+
# if files:
|
| 125 |
+
# file_to_delete = st.selectbox("Select a file to delete:", options=files)
|
| 126 |
+
# if st.button("Delete File"):
|
| 127 |
+
# try:
|
| 128 |
+
# os.remove(os.path.join(directory, file_to_delete))
|
| 129 |
+
# st.success(f"File '{file_to_delete}' deleted successfully!")
|
| 130 |
+
# except Exception as e:
|
| 131 |
+
# st.error(f"Error deleting file: {e}")
|
| 132 |
+
|
| 133 |
+
# # Chat system status
|
| 134 |
+
# st.divider()
|
| 135 |
+
# st.markdown("### System Status")
|
| 136 |
+
# if uploaded_file:
|
| 137 |
+
# st.success("Document loaded")
|
| 138 |
+
# else:
|
| 139 |
+
# st.info("No document uploaded")
|
| 140 |
+
|
| 141 |
+
# # Initialize chat history
|
| 142 |
+
# if "messages" not in st.session_state:
|
| 143 |
+
# st.session_state.messages = []
|
| 144 |
+
|
| 145 |
+
# # Display chat messages in streaming manner
|
| 146 |
+
# chat_placeholder = st.container()
|
| 147 |
+
|
| 148 |
+
# for message in st.session_state.messages:
|
| 149 |
+
# with chat_placeholder.container():
|
| 150 |
+
# with st.chat_message(message["role"]):
|
| 151 |
+
# st.markdown(message["content"])
|
| 152 |
+
# if message["role"] == "assistant" and "context" in message:
|
| 153 |
+
# with st.expander("View source context"):
|
| 154 |
+
# st.info(message["context"])
|
| 155 |
+
|
| 156 |
+
# # Chat input
|
| 157 |
+
# if prompt := st.chat_input("Ask me anything about your documents..."):
|
| 158 |
+
# res = extract_and_verify_url(prompt)
|
| 159 |
+
# print(res)
|
| 160 |
+
# if res["url"] != None:
|
| 161 |
+
# print(res["url"])
|
| 162 |
+
# st.session_state.messages.append({"role": "user", "content": prompt})
|
| 163 |
+
# with chat_placeholder.container():
|
| 164 |
+
# with st.chat_message("user"):
|
| 165 |
+
# st.markdown(prompt)
|
| 166 |
+
|
| 167 |
+
# with chat_placeholder.container():
|
| 168 |
+
# with st.chat_message("assistant"):
|
| 169 |
+
# with st.spinner("Thinking..."):
|
| 170 |
+
# result = call_rag_api(
|
| 171 |
+
# url=res["url"],
|
| 172 |
+
# query=prompt,
|
| 173 |
+
# is_uploaded=uploaded_file is not None,
|
| 174 |
+
# )
|
| 175 |
+
|
| 176 |
+
# if result["status"] == "success":
|
| 177 |
+
|
| 178 |
+
# response_content = result["response"]
|
| 179 |
+
# context = result["context"]
|
| 180 |
+
# citations = result["citations"]
|
| 181 |
+
|
| 182 |
+
# # st.markdown(response_content)
|
| 183 |
+
# st.write_stream(stream_data(response_content))
|
| 184 |
+
# with st.expander("View source context"):
|
| 185 |
+
# st.json(citations)
|
| 186 |
+
|
| 187 |
+
# st.session_state.messages.append(
|
| 188 |
+
# {
|
| 189 |
+
# "role": "assistant",
|
| 190 |
+
# "content": response_content,
|
| 191 |
+
# "context": context,
|
| 192 |
+
# "citations": citations,
|
| 193 |
+
# }
|
| 194 |
+
# )
|
| 195 |
+
# else:
|
| 196 |
+
# st.error(
|
| 197 |
+
# f"Error: {result.get('message', 'Unknown error occurred')}"
|
| 198 |
+
# )
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# if __name__ == "__main__":
|
| 202 |
+
# main()
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
import json
|
| 206 |
+
import os
|
| 207 |
+
import re
|
| 208 |
+
import time
|
| 209 |
+
from pathlib import Path
|
| 210 |
+
from typing import Dict, List
|
| 211 |
+
|
| 212 |
+
import requests
|
| 213 |
+
import streamlit as st
|
| 214 |
+
from streamlit_js_eval import streamlit_js_eval
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def extract_and_verify_url(string):
    """Extract the first URL in *string* and verify that it is reachable.

    Returns a dict with:
        url:         the extracted URL ("" when none was found)
        status:      human-readable verification status
        status_code: HTTP status code (present only when a HEAD request
                     completed without raising)
    """
    # NOTE(fix): the original character class was written ``[!*\\(\\),]``
    # inside a raw string, which matches a literal backslash rather than
    # the intended escaped parentheses; ``[!*(),]`` restores the intent.
    url_pattern = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    match = url_pattern.search(string)
    if match:
        url = match.group()
        try:
            # HEAD keeps the check cheap; follow redirects, short timeout.
            response = requests.head(url, allow_redirects=True, timeout=5)
            if response.status_code == 200:
                return {
                    "url": url,
                    "status": "Valid and content exists",
                    "status_code": 200,
                }
            return {
                "url": url,
                "status": f"Invalid (HTTP {response.status_code})",
                "status_code": response.status_code,
            }
        except requests.RequestException as e:
            # Network failure: still report the URL, with the error text.
            return {"url": url, "status": f"Error: {e}"}
    return {"url": "", "status": "No URL found"}
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def stream_data(data_val):
    """Yield the space-separated words of *data_val* one at a time
    (each with a trailing space), pausing briefly between words to
    simulate streaming output in the UI."""
    for token in data_val.split(" "):
        yield f"{token} "
        time.sleep(0.02)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def list_files_in_directory(directory):
    """Return the entries of *directory*, or [] when it does not exist."""
    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        # Missing directory is treated as simply empty.
        return []
    return entries
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def save_uploaded_file(uploaded_file, directory):
    """Persist *uploaded_file* (a Streamlit UploadedFile) into *directory*
    and return the path of the saved file."""
    destination = os.path.join(directory, uploaded_file.name)
    with open(destination, "wb") as out:
        out.write(uploaded_file.getbuffer())
    return destination
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def call_rag_api(query: str, url: str, is_uploaded: bool = False) -> Dict:
    """POST *query* to the local RAG endpoint and normalise the reply.

    Returns ``{"status": "success", "response", "context", "citations"}``
    on HTTP success, or ``{"status": "error", "message": ...}`` when the
    request fails.
    """
    endpoint = "http://127.0.0.1:8000/get-response"
    payload = {"query": query, "is_uploaded": is_uploaded, "url": url}

    try:
        reply = requests.post(endpoint, json=payload)
        reply.raise_for_status()
        body = reply.json()
        return {
            "status": "success",
            "response": body["response"],
            "context": body["context"],
            "citations": body["citations"],
        }
    except requests.exceptions.RequestException as exc:
        return {"status": "error", "message": str(exc)}
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def call_llm_api(query: str) -> Dict:
    """POST *query* to the plain-LLM endpoint.

    Returns ``{"status": "success", "response": ...}`` on HTTP success,
    or ``{"status": "error", "message": ...}`` when the request fails.
    """
    endpoint = "http://127.0.0.1:8000/llm-response"

    try:
        reply = requests.post(endpoint, json={"query": query})
        reply.raise_for_status()
        body = reply.json()
        return {"status": "success", "response": body["response"]}
    except requests.exceptions.RequestException as exc:
        return {"status": "error", "message": str(exc)}
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
# Main Streamlit app
|
| 299 |
+
def main():
    """Streamlit entry point: renders the chat UI with a sidebar for
    document management (upload/delete against the local API on port 8000)
    and three answering modes: plain LLM, web-search agent, and RAG."""
    st.title("🤖 Multi-Functional Chat Assistant")

    # Sidebar inputs and actions
    with st.sidebar:

        # Chat functionality selection
        mode = st.radio("Select Mode:", ["LLM Answering", "Web Search Agent", "RAG"])

        st.header("📂 Document Control")

        # Input directory
        directory = st.text_input("Enter the directory path:", value="data")

        # Ensure directory exists
        Path(directory).mkdir(parents=True, exist_ok=True)

        # Display files in the directory
        st.subheader("Files in Directory")
        files = list_files_in_directory(directory)
        if files:
            st.write(files)
        else:
            st.write("No files found.")

        # Upload file: save locally, ask the backend to index it, then
        # force a full page reload so the file list refreshes.
        st.subheader("Upload a File")
        uploaded_file = st.file_uploader(
            "Choose a file", type=["txt", "pdf", "doc", "docx", "mp3", "mp4"]
        )
        if uploaded_file:
            file_path = save_uploaded_file(uploaded_file, directory)
            endpoint = f"http://127.0.0.1:8000/process-file"
            payload = {"file_path": file_path}

            # NOTE(review): `response` is never inspected — a backend
            # failure here is silently ignored; confirm this is intended.
            with st.spinner("File is in process..."):
                response = requests.post(endpoint, json=payload)

            st.success(f"File '{uploaded_file.name}' uploaded successfully!")
            time.sleep(3)
            streamlit_js_eval(js_expressions="parent.window.location.reload()")

        # Delete a file: notify the backend, then remove the local copy
        # and force a page reload.
        st.subheader("Delete a File")
        if files:
            file_to_delete = st.selectbox("Select a file to delete:", options=files)
            if st.button("Delete File"):
                try:
                    # NOTE(review): the backend receives the bare filename,
                    # not the joined path used by os.remove below — confirm
                    # the delete-file endpoint expects just the name.
                    payload = {"file_path": file_to_delete}
                    endpoint = f"http://127.0.0.1:8000/delete-file"

                    with st.spinner("File is deleting..."):
                        response = requests.post(endpoint, json=payload)
                        os.remove(os.path.join(directory, file_to_delete))

                    st.success(f"File '{file_to_delete}' deleted successfully!")
                    time.sleep(3)
                    streamlit_js_eval(js_expressions="parent.window.location.reload()")
                except Exception as e:
                    st.error(f"Error deleting file: {e}")

        # Chat system status
        st.divider()
        st.markdown("### System Status")
        if uploaded_file:
            st.success("Document loaded")
        else:
            st.info("No document uploaded")

    # Initialize chat history
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Display chat messages in streaming manner
    chat_placeholder = st.container()

    for message in st.session_state.messages:
        with chat_placeholder.container():
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

    # # Chat functionality selection
    # mode = st.radio("Select Mode:", ["LLM Answering", "Web Search Agent", "RAG"])

    # Chat input: record the user turn, then answer according to `mode`.
    if prompt := st.chat_input("Ask me anything..."):
        st.session_state.messages.append({"role": "user", "content": prompt})
        with chat_placeholder.container():
            with st.chat_message("user"):
                st.markdown(prompt)

        with chat_placeholder.container():
            with st.chat_message("assistant"):
                with st.spinner("Thinking..."):
                    # Collected citations shown in the expander at the end;
                    # stays [] for the plain-LLM mode.
                    citations = []
                    if mode == "LLM Answering":
                        # Direct LLM answer, no retrieval.
                        result = call_llm_api(prompt)
                        if result["status"] == "success":
                            response_content = result.get("response", "")
                            st.write_stream(stream_data(response_content))

                            st.session_state.messages.append(
                                {"role": "assistant", "content": response_content}
                            )

                        else:
                            st.error(
                                f"Error: {result.get('message', 'Unknown error occurred')}"
                            )
                    elif mode == "Web Search Agent":
                        # Pass along any URL the user embedded in the prompt.
                        res = extract_and_verify_url(prompt)
                        result = call_rag_api(
                            url=res.get("url", ""),
                            query=prompt,
                            is_uploaded=uploaded_file is not None,
                        )
                        if result["status"] == "success":

                            response_content = result["response"]
                            context = result["context"]
                            citations = result["citations"]

                            # st.markdown(response_content)
                            st.write_stream(stream_data(response_content))
                            # with st.expander("View source context"):
                            #     st.json(citations)

                            st.session_state.messages.append(
                                {
                                    "role": "assistant",
                                    "content": response_content,
                                    "context": context,
                                    "citations": citations,
                                }
                            )
                        else:
                            st.error(
                                f"Error: {result.get('message', 'Unknown error occurred')}"
                            )

                    elif mode == "RAG":
                        # NOTE(review): `res` is computed but unused here;
                        # url="None" is the sentinel the backend uses to
                        # mean "answer from the indexed documents".
                        res = extract_and_verify_url(prompt)
                        result = call_rag_api(
                            url="None",
                            query=prompt,
                            is_uploaded=uploaded_file is not None,
                        )

                        if result["status"] == "success":

                            response_content = result["response"]
                            context = result["context"]
                            citations = result["citations"]

                            # st.markdown(response_content)
                            st.write_stream(stream_data(response_content))
                            # with st.expander("View source context"):
                            #     st.json(citations)

                            st.session_state.messages.append(
                                {
                                    "role": "assistant",
                                    "content": response_content,
                                    "context": context,
                                    "citations": citations,
                                }
                            )
                        else:
                            st.error(
                                f"Error: {result.get('message', 'Unknown error occurred')}"
                            )

                    with st.expander("View source context"):
                        st.json(citations)
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
if __name__ == "__main__":
|
| 476 |
+
main()
|
src/assistants/__init__.py
ADDED
|
File without changes
|
src/assistants/assistant_v1.py
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import os
|
| 2 |
+
# from typing import List
|
| 3 |
+
|
| 4 |
+
# import google.generativeai as genai
|
| 5 |
+
# from dotenv import load_dotenv
|
| 6 |
+
# from langchain.prompts import PromptTemplate
|
| 7 |
+
# from langchain.schema import StrOutputParser
|
| 8 |
+
# from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
# from .gemini_assistant import GeminiAssistant
|
| 11 |
+
|
| 12 |
+
# load_dotenv()
|
| 13 |
+
|
| 14 |
+
# api_key = os.getenv("GEMINI_API_KEY")
|
| 15 |
+
# model_name = "gemini-2.0-flash-exp"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# class GenerativeAnswer(BaseModel):
|
| 19 |
+
# answer: str = Field(
|
| 20 |
+
# ...,
|
| 21 |
+
# description="""The answer to the user question, which is based only on the given sources.\n
|
| 22 |
+
# example: Artificial neural networks were inspired by information processing and distributed communication nodes in biological systems. ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic, while the biological brain of most living organisms is dynamic (plastic) and analog (vec-001). However, ANNs have demonstrated significant capabilities in tasks such as pattern recognition, classification, and prediction, making them valuable tools in various fields (vec-002). Despite their effectiveness in certain applications, ANNs still lack the complexity and adaptability of biological brains, leading some researchers to view them as low-quality models for brain function (vec-003). Furthermore, advancements in neuroscience continue to provide insights into the workings of the brain, which may further differentiate artificial neural networks from their biological counterparts (vec-004). However, ANNs remain a powerful computational tool, offering practical solutions to a wide range of problems, albeit with inherent limitations compared to biological systems (vec-005) and so on.....
|
| 23 |
+
# """,
|
| 24 |
+
# )
|
| 25 |
+
# source_id: List[str] = Field(
|
| 26 |
+
# ...,
|
| 27 |
+
# description="The ID of a SPECIFIC source which justifies the answer.",
|
| 28 |
+
# )
|
| 29 |
+
# quote: List[str] = Field(
|
| 30 |
+
# ...,
|
| 31 |
+
# description="The VERBATIM quote from the specified source that justifies the answer.",
|
| 32 |
+
# )
|
| 33 |
+
# related_questions: List[str] = Field(
|
| 34 |
+
# ...,
|
| 35 |
+
# description="Related questions based on the user question and generated answer.",
|
| 36 |
+
# )
|
| 37 |
+
|
| 38 |
+
# # [INST]<<SYS>>You are an Intelligent Assistant for question-answering tasks.
|
| 39 |
+
# # Your knowledge base consists of documents related to Convene and its resources.
|
| 40 |
+
# # Convene is a community of Christian business leaders who gather monthly in advisory teams worldwide to share insights, face challenges, and grow in faith through executive learning. Members can join or lead a Convene Team, participate in One2One™ Coaching, and access exclusive events. The organization, founded in 1996 by Brian Thatcher and others from Saddleback Church, aims to help CEOs integrate faith with business to build profitable, faith-based enterprises. Services include peer advisory, executive coaching, and business consulting, focusing on enhancing business and leadership skills while maintaining a faith foundation. Convene emphasizes balancing business success with spiritual growth and family leadership.
|
| 41 |
+
|
| 42 |
+
# # Your [Task] is to answer questions accurately and concisely based on the context provided. You will strictly discuss topics within the scope of the context provided and Convene's activities, history, services, and community impacts as outlined.
|
| 43 |
+
# # Identify if the User question aligns with the context or Convene's interests. If the information requested is outside of the context or Convene's provided data or knowledge base, You should politely decline the request and tell the user to ask related questions.
|
| 44 |
+
# # Only provide answers based on the information available in your context and the above topics.
|
| 45 |
+
|
| 46 |
+
# # Use the following steps and pieces of retrieved context to answer the question. The answer should be detailed and concise.
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# prompt = """
|
| 50 |
+
# You are a friendly and helpful assistant.
|
| 51 |
+
# Ensure your answers are complete, unless the user requests a more concise approach.
|
| 52 |
+
# When generating code, offer explanations for code segments as necessary and maintain good coding practices.
|
| 53 |
+
# When presented with inquiries seeking information, provide answers that reflect a deep understanding of the field, guaranteeing their correctness.
|
| 54 |
+
# For any non-english queries, respond in the same language as the prompt unless otherwise specified by the user.
|
| 55 |
+
# For prompts involving reasoning, provide a clear explanation of each step in the reasoning process before presenting the final answer.
|
| 56 |
+
|
| 57 |
+
# [Task]
|
| 58 |
+
# Your task is to answer questions accurately and concisely based on the context provided. You will strictly discuss topics within the scope of the provided context, including its activities, history, services, and impacts.
|
| 59 |
+
# Assess whether the user’s question aligns with the context or the specified areas of interest.
|
| 60 |
+
# If the information requested falls outside the provided context or scope, politely decline the request and advise the user to ask relevant questions within the given context.
|
| 61 |
+
# Only provide answers based on the information available in the given context and the specified areas of interest.
|
| 62 |
+
|
| 63 |
+
# The answer should look like:
|
| 64 |
+
# Example: Artificial neural networks were inspired by information processing and distributed communication nodes in biological systems. ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic, while the biological brain of most living organisms is dynamic (plastic) and analog (vec-001). However, ANNs have demonstrated significant capabilities in tasks such as pattern recognition, classification, and prediction, making them valuable tools in various fields (vec-002). Despite their effectiveness in certain applications, ANNs still lack the complexity and adaptability of biological brains, leading some researchers to view them as low-quality models for brain function (vec-003). Furthermore, advancements in neuroscience continue to provide insights into the workings of the brain, which may further differentiate artificial neural networks from their biological counterparts (vec-004). However, ANNs remain a powerful computational tool, offering practical solutions to a wide range of problems, albeit with inherent limitations compared to biological systems (vec-005) and so on.....
|
| 65 |
+
# means your answer must contains source_id in given format.
|
| 66 |
+
# Output should follow the pattern defined in schema and the output should be in json format only so that it can be directly used with json.loads()
|
| 67 |
+
# <</SYS>>
|
| 68 |
+
# steps:
|
| 69 |
+
# 1. use the provided data.
|
| 70 |
+
# 2. Check if the question is related to Convene or related to the context provided. If the question is not related, ask them politely to ask related questions. Do not follow further steps but use the wrong_question_schema to output the answer.
|
| 71 |
+
# 3. use one source and extract the sentences from it that justify the answer all the source sentences are in must be unique.
|
| 72 |
+
# 4. use the sentences to generate the answer and append its source_id to the answer with prefix "vec-". You must append its source_id only if it exists. Do not hallucinate or add from your imagination. Only add source_id which you get from the context. This is a strict requirement.
|
| 73 |
+
# 5. use other source and extract the sentences from it that justify the answer. NOTE: use atleast 3 sources to generate the answer.
|
| 74 |
+
# 6. If the user has asked question related to the context or Convene, use the whole answer and user's question to generate 3 related questions. If the question is outside the context, do not generate related questions.
|
| 75 |
+
|
| 76 |
+
# NOTE: Make sure no out of bounds questions or questions that are not in the context or knowledge base are answered. This is mandatory that no out of context questions are answered."""
|
| 77 |
+
|
| 78 |
+
# gemini_rag_assistant = GeminiAssistant(
|
| 79 |
+
# model=model_name,
|
| 80 |
+
# api_key=api_key,
|
| 81 |
+
# system_prompt=prompt,
|
| 82 |
+
# output_model=GenerativeAnswer,
|
| 83 |
+
# )
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# import os
|
| 87 |
+
# from typing import List
|
| 88 |
+
|
| 89 |
+
# import google.generativeai as genai
|
| 90 |
+
# from dotenv import load_dotenv
|
| 91 |
+
# from langchain.prompts import PromptTemplate
|
| 92 |
+
# from langchain.schema import StrOutputParser
|
| 93 |
+
# from pydantic import BaseModel, Field
|
| 94 |
+
|
| 95 |
+
# from .gemini_assistant import GeminiAssistant
|
| 96 |
+
|
| 97 |
+
# # Load environment variables
|
| 98 |
+
# load_dotenv()
|
| 99 |
+
|
| 100 |
+
# # Load the API key from the environment
|
| 101 |
+
# api_key = os.getenv("GEMINI_API_KEY")
|
| 102 |
+
# model_name = "gemini-2.0-flash-exp"
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# # Define the output model schema using Pydantic
|
| 106 |
+
# class GenerativeAnswer(BaseModel):
|
| 107 |
+
# answer: str = Field(
|
| 108 |
+
# ...,
|
| 109 |
+
# description="""The answer to the user question, which is based only on the given sources.
|
| 110 |
+
# Example:
|
| 111 |
+
# Artificial neural networks were inspired by information processing and distributed communication nodes in biological systems.
|
| 112 |
+
# ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic,
|
| 113 |
+
# while the biological brain of most living organisms is dynamic (plastic) and analog (vec-001). However, ANNs have demonstrated
|
| 114 |
+
# significant capabilities in tasks such as pattern recognition, classification, and prediction, making them valuable tools in various fields
|
| 115 |
+
# (vec-002). Despite their effectiveness in certain applications, ANNs still lack the complexity and adaptability of biological brains,
|
| 116 |
+
# leading some researchers to view them as low-quality models for brain function (vec-003). Furthermore, advancements in neuroscience
|
| 117 |
+
# continue to provide insights into the workings of the brain, which may further differentiate artificial neural networks from their biological counterparts
|
| 118 |
+
# (vec-004). However, ANNs remain a powerful computational tool, offering practical solutions to a wide range of problems, albeit with inherent limitations
|
| 119 |
+
# compared to biological systems (vec-005).""",
|
| 120 |
+
# )
|
| 121 |
+
# source_id: List[str] = Field(
|
| 122 |
+
# ...,
|
| 123 |
+
# description="The ID of a SPECIFIC source which justifies the answer.",
|
| 124 |
+
# )
|
| 125 |
+
# quote: List[str] = Field(
|
| 126 |
+
# ...,
|
| 127 |
+
# description="The VERBATIM quote from the specified source that justifies the answer.",
|
| 128 |
+
# )
|
| 129 |
+
# related_questions: List[str] = Field(
|
| 130 |
+
# ...,
|
| 131 |
+
# description="Related questions based on the user question and generated answer.",
|
| 132 |
+
# )
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
# # Define the system prompt
|
| 136 |
+
# prompt = """
|
| 137 |
+
# You are a friendly and helpful assistant.
|
| 138 |
+
# Ensure your answers are complete, unless the user requests a more concise approach.
|
| 139 |
+
# When generating code, offer explanations for code segments as necessary and maintain good coding practices.
|
| 140 |
+
# When presented with inquiries seeking information, provide answers that reflect a deep understanding of the field, guaranteeing their correctness.
|
| 141 |
+
# For any non-English queries, respond in the same language as the prompt unless otherwise specified by the user.
|
| 142 |
+
# For prompts involving reasoning, provide a clear explanation of each step in the reasoning process before presenting the final answer.
|
| 143 |
+
|
| 144 |
+
# [Task]
|
| 145 |
+
# Your task is to answer questions accurately and concisely based on the context provided. You will strictly discuss topics within the scope of the provided context, including its activities, history, services, and impacts.
|
| 146 |
+
# Assess whether the user’s question aligns with the context or the specified areas of interest.
|
| 147 |
+
# If the information requested falls outside the provided context or scope, politely decline the request and advise the user to ask relevant questions within the given context.
|
| 148 |
+
# Only provide answers based on the information available in the given context and the specified areas of interest.
|
| 149 |
+
|
| 150 |
+
# The answer should look like:
|
| 151 |
+
# Example: Artificial neural networks were inspired by information processing and distributed communication nodes in biological systems.
|
| 152 |
+
# ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic,
|
| 153 |
+
# while the biological brain of most living organisms is dynamic (plastic) and analog (vec-001). However, ANNs have demonstrated
|
| 154 |
+
# significant capabilities in tasks such as pattern recognition, classification, and prediction, making them valuable tools in various fields
|
| 155 |
+
# (vec-002). Despite their effectiveness in certain applications, ANNs still lack the complexity and adaptability of biological brains,
|
| 156 |
+
# leading some researchers to view them as low-quality models for brain function (vec-003). Furthermore, advancements in neuroscience
|
| 157 |
+
# continue to provide insights into the workings of the brain, which may further differentiate artificial neural networks from their biological counterparts
|
| 158 |
+
# (vec-004). However, ANNs remain a powerful computational tool, offering practical solutions to a wide range of problems, albeit with inherent limitations
|
| 159 |
+
# compared to biological systems (vec-005).
|
| 160 |
+
# Make sure your answer contains source_id in the given format.
|
| 161 |
+
|
| 162 |
+
# Output should follow the pattern defined in the schema, and the output should be in JSON format only, so that it can be directly used with `json.loads()`.
|
| 163 |
+
|
| 164 |
+
# Steps:
|
| 165 |
+
# 1. Use the provided data.
|
| 166 |
+
# 2. Check if the question is related to the context or related to the provided scope. If the question is not related, politely ask the user to ask relevant questions. Do not proceed with other steps and use the wrong_question_schema to output the answer.
|
| 167 |
+
# 3. Use one source and extract the sentences from it that justify the answer. All source sentences must be unique.
|
| 168 |
+
# 4. Use the sentences to generate the answer and append its source_id to the answer with prefix "vec-". You must append its source_id only if it exists. Do not hallucinate or add from your imagination. Only add source_id which you get from the context. This is a strict requirement.
|
| 169 |
+
# 5. Use other sources and extract the sentences from them that justify the answer. NOTE: Use at least 3 sources to generate the answer.
|
| 170 |
+
# 6. If the user has asked a question related to the context or scope, use the entire answer and the user's question to generate 3 related questions. If the question is outside the context, do not generate related questions.
|
| 171 |
+
|
| 172 |
+
# NOTE: Ensure that no out-of-bounds questions or questions outside the context are answered. It is mandatory that no out-of-context questions are addressed.
|
| 173 |
+
# """
|
| 174 |
+
|
| 175 |
+
# # Instantiate the Gemini RAG assistant
|
| 176 |
+
# gemini_rag_assistant = GeminiAssistant(
|
| 177 |
+
# model=model_name,
|
| 178 |
+
# api_key=api_key,
|
| 179 |
+
# system_prompt=prompt,
|
| 180 |
+
# output_model=GenerativeAnswer,
|
| 181 |
+
# )
|
| 182 |
+
|
| 183 |
+
"""Configuration module for the Gemini-based RAG assistant.

Defines the structured output schema (``GenerativeAnswer``) and the system
prompt, then instantiates a module-level ``gemini_rag_assistant`` that the
rest of the application imports.
"""

import os
from typing import List

from dotenv import load_dotenv
from pydantic import BaseModel, Field

from .gemini_assistant import GeminiAssistant

# Load environment variables from a local .env file, if present.
load_dotenv()

# Load the API key from the environment.
api_key = os.getenv("GEMINI_API_KEY")
model_name = "gemini-2.0-flash-exp"

# Fail fast with an actionable message instead of passing api_key=None to the
# SDK and getting an opaque authentication error on the first request.
if not api_key:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Add it to the environment or a .env file."
    )


# Structured output schema the model is asked to produce.  The Field
# descriptions are part of the schema shown to the model, so their wording
# is behavior, not documentation.
class GenerativeAnswer(BaseModel):
    # The grounded answer text, with inline (vec-NNN) source citations.
    answer: str = Field(
        ...,
        description="""The answer to the user question, which is based only on the given sources.
Example:
Artificial neural networks were inspired by information processing and distributed communication nodes in biological systems.
ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic,
while the biological brain of most living organisms is dynamic (plastic) and analog (vec-001). However, ANNs have demonstrated
significant capabilities in tasks such as pattern recognition, classification, and prediction, making them valuable tools in various fields
(vec-002). Despite their effectiveness in certain applications, ANNs still lack the complexity and adaptability of biological brains,
leading some researchers to view them as low-quality models for brain function (vec-003). Furthermore, advancements in neuroscience
continue to provide insights into the workings of the brain, which may further differentiate artificial neural networks from their biological counterparts
(vec-004). However, ANNs remain a powerful computational tool, offering practical solutions to a wide range of problems, albeit with inherent limitations
compared to biological systems (vec-005).""",
    )
    # Follow-up questions generated from the user question and the answer.
    related_questions: List[str] = Field(
        ...,
        description="Related questions based on the user question and generated answer.",
    )


# System prompt passed verbatim to the assistant.
# NOTE: this is a plain string, not a Langchain PromptTemplate — the literal
# braces in the JSON example below are intentional and must not be formatted.
prompt_template = """
You are a friendly and helpful assistant.
Ensure your answers are complete, unless the user requests a more concise approach.
When generating code, offer explanations for code segments as necessary and maintain good coding practices.
When presented with inquiries seeking information, provide answers that reflect a deep understanding of the field, guaranteeing their correctness.
For any non-English queries, respond in the same language as the prompt unless otherwise specified by the user.
For prompts involving reasoning, provide a clear explanation of each step in the reasoning process before presenting the final answer.

[Task]
Your task is to answer questions accurately and concisely based on the context provided. You will strictly discuss topics within the scope of the provided context, including its activities, history, services, and impacts.
Assess whether the user’s question aligns with the context or the specified areas of interest.
If the information requested falls outside the provided context or scope, politely decline the request and advise the user to ask relevant questions within the given context.
Only provide answers based on the information available in the given context and the specified areas of interest.

The answer should look like:
Example: Artificial neural networks were inspired by information processing and distributed communication nodes in biological systems.
ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic,
while the biological brain of most living organisms is dynamic (plastic) and analog (vec-001). However, ANNs have demonstrated
significant capabilities in tasks such as pattern recognition, classification, and prediction, making them valuable tools in various fields
(vec-002). Despite their effectiveness in certain applications, ANNs still lack the complexity and adaptability of biological brains,
leading some researchers to view them as low-quality models for brain function (vec-003). Furthermore, advancements in neuroscience
continue to provide insights into the workings of the brain, which may further differentiate artificial neural networks from their biological counterparts
(vec-004). However, ANNs remain a powerful computational tool, offering practical solutions to a wide range of problems, albeit with inherent limitations
compared to biological systems (vec-005).

The final output should be structured like this:
1. The complete answer to the user question.
2. The source IDs appended to the answer, e.g., (vec-001, vec-002, vec-003).
3. Related questions that might arise from the user's question.

Output should be in the following format:
{
    "answer": "<generated_answer_with_sources>",
    "related_questions": [<related_question_1>, <related_question_2>, <related_question_3>]
}
"""


# Instantiate the module-level Gemini RAG assistant used by the application.
gemini_rag_assistant = GeminiAssistant(
    model=model_name,
    api_key=api_key,
    system_prompt=prompt_template,
    output_model=GenerativeAnswer,
)
|
src/assistants/gemini_assistant.py
ADDED
|
@@ -0,0 +1,481 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import json
|
| 2 |
+
# import os
|
| 3 |
+
# from dataclasses import dataclass
|
| 4 |
+
# from enum import Enum
|
| 5 |
+
# from typing import Any, Dict, List, Optional, Union
|
| 6 |
+
|
| 7 |
+
# import google.generativeai as genai
|
| 8 |
+
# from pydantic import BaseModel
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# class Role(Enum):
|
| 12 |
+
# USER = "user"
|
| 13 |
+
# ASSISTANT = "assistant"
|
| 14 |
+
# SYSTEM = "system"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# @dataclass
|
| 18 |
+
# class Message:
|
| 19 |
+
# role: Role
|
| 20 |
+
# content: str
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# class GeminiAssistant:
|
| 24 |
+
# def __init__(
|
| 25 |
+
# self,
|
| 26 |
+
# api_key: str,
|
| 27 |
+
# model: str = "gemini-pro",
|
| 28 |
+
# temperature: float = 0.7,
|
| 29 |
+
# top_p: float = 0.95,
|
| 30 |
+
# top_k: int = 40,
|
| 31 |
+
# max_output_tokens: int = 2048,
|
| 32 |
+
# system_prompt: Optional[str] = None,
|
| 33 |
+
# output_model: Optional[type[BaseModel]] = None,
|
| 34 |
+
# ):
|
| 35 |
+
# genai.configure(api_key=api_key)
|
| 36 |
+
|
| 37 |
+
# self.model = genai.GenerativeModel(
|
| 38 |
+
# model_name=model,
|
| 39 |
+
# system_instruction=system_prompt,
|
| 40 |
+
# generation_config={
|
| 41 |
+
# "temperature": temperature,
|
| 42 |
+
# "top_p": top_p,
|
| 43 |
+
# "top_k": top_k,
|
| 44 |
+
# "max_output_tokens": max_output_tokens,
|
| 45 |
+
# "response_mime_type": "application/json",
|
| 46 |
+
# },
|
| 47 |
+
# )
|
| 48 |
+
|
| 49 |
+
# self.system_prompt = system_prompt
|
| 50 |
+
# self.output_model = output_model
|
| 51 |
+
# self.conversation_history: List[Message] = []
|
| 52 |
+
# if system_prompt:
|
| 53 |
+
# self.conversation_history.append(
|
| 54 |
+
# Message(role=Role.SYSTEM, content=system_prompt)
|
| 55 |
+
# )
|
| 56 |
+
|
| 57 |
+
# def add_message(self, role: Union[Role, str], content: str) -> None:
|
| 58 |
+
# if isinstance(role, str):
|
| 59 |
+
# role = Role(role)
|
| 60 |
+
# self.conversation_history.append(Message(role=role, content=content))
|
| 61 |
+
|
| 62 |
+
# def _format_messages_for_gemini(self) -> List[Dict]:
|
| 63 |
+
# """Convert internal message format to Gemini's expected format."""
|
| 64 |
+
# formatted_history = []
|
| 65 |
+
|
| 66 |
+
# for msg in self.conversation_history:
|
| 67 |
+
# if msg.role == Role.USER:
|
| 68 |
+
# formatted_history.append(
|
| 69 |
+
# {"role": "user", "parts": [{"text": msg.content}]}
|
| 70 |
+
# )
|
| 71 |
+
# elif msg.role == Role.ASSISTANT:
|
| 72 |
+
# formatted_history.append(
|
| 73 |
+
# {"role": "model", "parts": [{"text": msg.content}]}
|
| 74 |
+
# )
|
| 75 |
+
# # System messages are handled differently
|
| 76 |
+
|
| 77 |
+
# return formatted_history
|
| 78 |
+
|
| 79 |
+
# async def get_response(
|
| 80 |
+
# self,
|
| 81 |
+
# message: str,
|
| 82 |
+
# context_data: Optional[str] = None,
|
| 83 |
+
# ) -> Union[str, BaseModel]:
|
| 84 |
+
# """Get a response from the assistant for the given message."""
|
| 85 |
+
# # Prepare the full message with system prompt if it exists
|
| 86 |
+
# full_message = message
|
| 87 |
+
# if self.system_prompt and not self.conversation_history:
|
| 88 |
+
# full_message = (
|
| 89 |
+
# f"{self.system_prompt}\n\n{message}\n\nContext:\n{context_data}\n"
|
| 90 |
+
# )
|
| 91 |
+
# else:
|
| 92 |
+
|
| 93 |
+
# full_message = f"{message}\n\nContext:\n{context_data}\n"
|
| 94 |
+
|
| 95 |
+
# print("********************")
|
| 96 |
+
# print(full_message)
|
| 97 |
+
# print("********************")
|
| 98 |
+
|
| 99 |
+
# # Add user message to history
|
| 100 |
+
# self.add_message(Role.USER, full_message)
|
| 101 |
+
|
| 102 |
+
# try:
|
| 103 |
+
# # Create chat session with properly formatted history
|
| 104 |
+
# chat = self.model.start_chat(history=self._format_messages_for_gemini())
|
| 105 |
+
|
| 106 |
+
# # Generate response
|
| 107 |
+
# response = await chat.send_message_async(full_message)
|
| 108 |
+
# response_text = response.text
|
| 109 |
+
|
| 110 |
+
# # Handle structured output if output_model is specified
|
| 111 |
+
# if self.output_model:
|
| 112 |
+
# try:
|
| 113 |
+
# # Try to parse the response as JSON
|
| 114 |
+
# response_data = json.loads(response_text)
|
| 115 |
+
# # Validate with Pydantic model
|
| 116 |
+
# structured_response = self.output_model(**response_data)
|
| 117 |
+
# # Add the formatted response to history
|
| 118 |
+
# self.add_message(
|
| 119 |
+
# Role.ASSISTANT, json.dumps(response_data, indent=2)
|
| 120 |
+
# )
|
| 121 |
+
# return structured_response
|
| 122 |
+
# except (json.JSONDecodeError, ValueError) as e:
|
| 123 |
+
# # If parsing fails, try to fix the response format
|
| 124 |
+
# retry_prompt = (
|
| 125 |
+
# "Please format your previous response as a valid JSON object "
|
| 126 |
+
# "that matches the specified schema. Include all required fields."
|
| 127 |
+
# )
|
| 128 |
+
# response = await chat.send_message_async(retry_prompt)
|
| 129 |
+
# try:
|
| 130 |
+
# print(response.text)
|
| 131 |
+
# response_data = json.loads(response.text)
|
| 132 |
+
# structured_response = self.output_model(**response_data)
|
| 133 |
+
# self.add_message(
|
| 134 |
+
# Role.ASSISTANT, json.dumps(response_data, indent=2)
|
| 135 |
+
# )
|
| 136 |
+
# return structured_response
|
| 137 |
+
# except (json.JSONDecodeError, ValueError) as e:
|
| 138 |
+
# raise ValueError(
|
| 139 |
+
# f"Failed to generate properly formatted response: {str(e)}"
|
| 140 |
+
# )
|
| 141 |
+
# else:
|
| 142 |
+
# # Handle regular string response
|
| 143 |
+
# self.add_message(Role.ASSISTANT, response_text)
|
| 144 |
+
# return response_text
|
| 145 |
+
|
| 146 |
+
# except Exception as e:
|
| 147 |
+
# raise Exception(f"Error generating response: {str(e)}")
|
| 148 |
+
|
| 149 |
+
# def clear_history(self) -> None:
|
| 150 |
+
# """Clear the conversation history."""
|
| 151 |
+
# self.conversation_history = []
|
| 152 |
+
# if self.system_prompt:
|
| 153 |
+
# self.add_message(Role.SYSTEM, self.system_prompt)
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# # Example usage
|
| 157 |
+
# async def main():
|
| 158 |
+
# # Initialize assistant
|
| 159 |
+
# assistant = GeminiAssistant(
|
| 160 |
+
# api_key="your-api-key-here", system_prompt="You are a helpful AI assistant."
|
| 161 |
+
# )
|
| 162 |
+
|
| 163 |
+
# # Get response
|
| 164 |
+
# response = await assistant.get_response("What is machine learning?")
|
| 165 |
+
# print(response)
|
| 166 |
+
|
| 167 |
+
# # Continue conversation
|
| 168 |
+
# response = await assistant.get_response("Can you give me an example?")
|
| 169 |
+
# print(response)
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# if __name__ == "__main__":
|
| 173 |
+
# import asyncio
|
| 174 |
+
|
| 175 |
+
# asyncio.run(main())
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# import json
|
| 179 |
+
# import os
|
| 180 |
+
# from dataclasses import dataclass
|
| 181 |
+
# from enum import Enum
|
| 182 |
+
# from typing import Any, Dict, List, Optional, Union
|
| 183 |
+
|
| 184 |
+
# import google.generativeai as genai
|
| 185 |
+
# from pydantic import BaseModel
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# class Role(Enum):
|
| 189 |
+
# USER = "user"
|
| 190 |
+
# ASSISTANT = "assistant"
|
| 191 |
+
# SYSTEM = "system"
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# @dataclass
|
| 195 |
+
# class Message:
|
| 196 |
+
# role: Role
|
| 197 |
+
# content: str
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# class GeminiAssistant:
|
| 201 |
+
# def __init__(
|
| 202 |
+
# self,
|
| 203 |
+
# api_key: str,
|
| 204 |
+
# model: str = "gemini-pro",
|
| 205 |
+
# temperature: float = 0.7,
|
| 206 |
+
# top_p: float = 0.95,
|
| 207 |
+
# top_k: int = 40,
|
| 208 |
+
# max_output_tokens: int = 2048,
|
| 209 |
+
# system_prompt: Optional[str] = None,
|
| 210 |
+
# output_model: Optional[type[BaseModel]] = None,
|
| 211 |
+
# ):
|
| 212 |
+
# genai.configure(api_key=api_key)
|
| 213 |
+
|
| 214 |
+
# self.model = genai.GenerativeModel(
|
| 215 |
+
# model_name=model,
|
| 216 |
+
# system_instruction=system_prompt,
|
| 217 |
+
# generation_config={
|
| 218 |
+
# "temperature": temperature,
|
| 219 |
+
# "top_p": top_p,
|
| 220 |
+
# "top_k": top_k,
|
| 221 |
+
# "max_output_tokens": max_output_tokens,
|
| 222 |
+
# "response_mime_type": "application/json",
|
| 223 |
+
# },
|
| 224 |
+
# )
|
| 225 |
+
|
| 226 |
+
# self.system_prompt = system_prompt
|
| 227 |
+
# self.output_model = output_model
|
| 228 |
+
# self.conversation_history: List[Message] = []
|
| 229 |
+
# if system_prompt:
|
| 230 |
+
# self.conversation_history.append(
|
| 231 |
+
# Message(role=Role.SYSTEM, content=system_prompt)
|
| 232 |
+
# )
|
| 233 |
+
|
| 234 |
+
# def add_message(self, role: Union[Role, str], content: str) -> None:
|
| 235 |
+
# if isinstance(role, str):
|
| 236 |
+
# role = Role(role)
|
| 237 |
+
# self.conversation_history.append(Message(role=role, content=content))
|
| 238 |
+
|
| 239 |
+
# def _format_messages_for_gemini(self) -> List[Dict]:
|
| 240 |
+
# """Convert internal message format to Gemini's expected format."""
|
| 241 |
+
# formatted_history = []
|
| 242 |
+
|
| 243 |
+
# for msg in self.conversation_history:
|
| 244 |
+
# if msg.role == Role.USER:
|
| 245 |
+
# formatted_history.append(
|
| 246 |
+
# {"role": "user", "parts": [{"text": msg.content}]}
|
| 247 |
+
# )
|
| 248 |
+
# elif msg.role == Role.ASSISTANT:
|
| 249 |
+
# formatted_history.append(
|
| 250 |
+
# {"role": "model", "parts": [{"text": msg.content}]}
|
| 251 |
+
# )
|
| 252 |
+
# # System messages are handled differently
|
| 253 |
+
|
| 254 |
+
# return formatted_history
|
| 255 |
+
|
| 256 |
+
# async def get_response_gemini(
|
| 257 |
+
# self,
|
| 258 |
+
# message: str,
|
| 259 |
+
# context_data: Optional[str] = None,
|
| 260 |
+
# ) -> Union[str, BaseModel]:
|
| 261 |
+
# """Get a response from the assistant for the given message."""
|
| 262 |
+
# # Prepare the full message with system prompt if it exists
|
| 263 |
+
# full_message = message
|
| 264 |
+
# if self.system_prompt and not self.conversation_history:
|
| 265 |
+
# full_message = (
|
| 266 |
+
# f"{self.system_prompt}\n\n{message}\n\nContext:\n{context_data}\n"
|
| 267 |
+
# )
|
| 268 |
+
# else:
|
| 269 |
+
# full_message = f"{message}\n\nContext:\n{context_data}\n"
|
| 270 |
+
|
| 271 |
+
# print("********************")
|
| 272 |
+
# print(full_message)
|
| 273 |
+
# print("********************")
|
| 274 |
+
|
| 275 |
+
# # Add user message to history
|
| 276 |
+
# self.add_message(Role.USER, full_message)
|
| 277 |
+
|
| 278 |
+
# try:
|
| 279 |
+
# # Create chat session with properly formatted history
|
| 280 |
+
# chat = self.model.start_chat(history=self._format_messages_for_gemini())
|
| 281 |
+
|
| 282 |
+
# # Generate response
|
| 283 |
+
# response = await chat.send_message_async(full_message)
|
| 284 |
+
# response_text = response.text
|
| 285 |
+
|
| 286 |
+
# # Handle structured output if output_model is specified
|
| 287 |
+
# if self.output_model:
|
| 288 |
+
# try:
|
| 289 |
+
# # Try to parse the response as JSON
|
| 290 |
+
# response_data = json.loads(response_text)
|
| 291 |
+
|
| 292 |
+
# # Ensure required fields are present
|
| 293 |
+
# if "source_id" not in response_data:
|
| 294 |
+
# response_data["source_id"] = "default_source_id"
|
| 295 |
+
# if "quote" not in response_data:
|
| 296 |
+
# response_data["quote"] = "default_quote"
|
| 297 |
+
|
| 298 |
+
# # Validate with Pydantic model
|
| 299 |
+
# structured_response = self.output_model(**response_data)
|
| 300 |
+
# # Add the formatted response to history
|
| 301 |
+
# self.add_message(
|
| 302 |
+
# Role.ASSISTANT, json.dumps(response_data, indent=2)
|
| 303 |
+
# )
|
| 304 |
+
# return structured_response
|
| 305 |
+
# except (json.JSONDecodeError, ValueError) as e:
|
| 306 |
+
# # If parsing fails, try to fix the response format
|
| 307 |
+
# retry_prompt = (
|
| 308 |
+
# "Please format your previous response as a valid JSON object "
|
| 309 |
+
# "that matches the specified schema. Include all required fields."
|
| 310 |
+
# )
|
| 311 |
+
# response = await chat.send_message_async(retry_prompt)
|
| 312 |
+
# try:
|
| 313 |
+
# print(response.text)
|
| 314 |
+
# response_data = json.loads(response.text)
|
| 315 |
+
|
| 316 |
+
# # Ensure required fields are present
|
| 317 |
+
# if "source_id" not in response_data:
|
| 318 |
+
# response_data["source_id"] = "default_source_id"
|
| 319 |
+
# if "quote" not in response_data:
|
| 320 |
+
# response_data["quote"] = "default_quote"
|
| 321 |
+
|
| 322 |
+
# structured_response = self.output_model(**response_data)
|
| 323 |
+
# self.add_message(
|
| 324 |
+
# Role.ASSISTANT, json.dumps(response_data, indent=2)
|
| 325 |
+
# )
|
| 326 |
+
# return structured_response
|
| 327 |
+
# except (json.JSONDecodeError, ValueError) as e:
|
| 328 |
+
# raise ValueError(
|
| 329 |
+
# f"Failed to generate properly formatted response: {str(e)}"
|
| 330 |
+
# )
|
| 331 |
+
# else:
|
| 332 |
+
# # Handle regular string response
|
| 333 |
+
# self.add_message(Role.ASSISTANT, response_text)
|
| 334 |
+
# return response_text
|
| 335 |
+
|
| 336 |
+
# except Exception as e:
|
| 337 |
+
# raise Exception(f"Error generating response: {str(e)}")
|
| 338 |
+
|
| 339 |
+
# def clear_history(self) -> None:
|
| 340 |
+
# """Clear the conversation history."""
|
| 341 |
+
# self.conversation_history = []
|
| 342 |
+
# if self.system_prompt:
|
| 343 |
+
# self.add_message(Role.SYSTEM, self.system_prompt)
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
import json
|
| 347 |
+
from dataclasses import dataclass
|
| 348 |
+
from enum import Enum
|
| 349 |
+
from typing import Any, Dict, List, Optional, Union
|
| 350 |
+
|
| 351 |
+
import google.generativeai as genai
|
| 352 |
+
from pydantic import BaseModel
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
class Role(Enum):
    """Identity of a conversation participant, serialized by string value."""

    USER = "user"
    ASSISTANT = "assistant"
    SYSTEM = "system"
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
@dataclass
class Message:
    """A single conversation turn: who spoke and what was said."""

    role: Role  # speaker identity (user / assistant / system)
    content: str  # raw message text
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
class GeminiAssistant:
    """Wrapper around a Gemini chat model that keeps conversation history
    and returns JSON responses, optionally validated by a Pydantic model.

    The model is configured with ``response_mime_type: application/json`` so
    replies can be parsed with ``json.loads``.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "gemini-pro",
        temperature: float = 0.7,
        top_p: float = 0.95,
        top_k: int = 40,
        max_output_tokens: int = 2048,
        system_prompt: Optional[str] = None,
        output_model: Optional[type[BaseModel]] = None,
    ):
        """Configure the Gemini SDK and build the generative model.

        Args:
            api_key: Google Generative AI API key.
            model: Gemini model name.
            temperature / top_p / top_k / max_output_tokens: sampling config.
            system_prompt: Optional system instruction; also seeded into history.
            output_model: Optional Pydantic model used to validate responses.
        """
        genai.configure(api_key=api_key)

        self.model = genai.GenerativeModel(
            model_name=model,
            system_instruction=system_prompt,
            generation_config={
                "temperature": temperature,
                "top_p": top_p,
                "top_k": top_k,
                "max_output_tokens": max_output_tokens,
                # Force JSON output so responses can be parsed reliably.
                "response_mime_type": "application/json",
            },
        )

        self.system_prompt = system_prompt
        self.output_model = output_model
        self.conversation_history: List[Message] = []
        if system_prompt:
            self.conversation_history.append(
                Message(role=Role.SYSTEM, content=system_prompt)
            )

    def add_message(self, role: Union[Role, str], content: str) -> None:
        """Append a message to history; accepts a Role or its string value."""
        if isinstance(role, str):
            role = Role(role)
        self.conversation_history.append(Message(role=role, content=content))

    def _format_messages_for_gemini(self) -> List[Dict]:
        """Convert internal history to Gemini's expected chat format.

        System messages are intentionally skipped here: the system prompt is
        already passed via ``system_instruction`` at model construction time.
        """
        formatted_history = []
        for msg in self.conversation_history:
            if msg.role == Role.USER:
                formatted_history.append(
                    {"role": "user", "parts": [{"text": msg.content}]}
                )
            elif msg.role == Role.ASSISTANT:
                formatted_history.append(
                    {"role": "model", "parts": [{"text": msg.content}]}
                )
        return formatted_history

    def get_response_gemini(
        self,
        message: str,
        context_data: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Send *message* (plus optional retrieval context) and return parsed JSON.

        Args:
            message: The user's query.
            context_data: Optional retrieval context appended to the prompt.

        Returns:
            The parsed JSON response as a dict (validated through
            ``self.output_model`` first, when one is configured).

        Raises:
            ValueError: If the model's reply cannot be parsed as JSON.
            Exception: If the underlying API call fails.
        """
        # Build the outgoing prompt. Only append a "Context:" section when
        # context was actually supplied — previously a literal
        # "Context:\nNone" was sent to the model when context_data was None.
        parts = []
        if self.system_prompt and not self.conversation_history:
            parts.append(self.system_prompt)
        parts.append(message)
        if context_data is not None:
            parts.append(f"Context:\n{context_data}")
        full_message = "\n\n".join(parts) + "\n"

        # Add user message to history
        self.add_message(Role.USER, full_message)

        try:
            # Create chat session with properly formatted history
            chat = self.model.start_chat(history=self._format_messages_for_gemini())
            response = chat.send_message(full_message)
            response_text = response.text
        except Exception as e:
            raise Exception(f"Error generating response: {str(e)}") from e

        # Parse outside the API try-block so the ValueError below is not
        # swallowed and re-wrapped as a generic Exception (callers catch
        # ValueError to map it to an HTTP 400).
        try:
            response_data = json.loads(response_text)
        except (json.JSONDecodeError, ValueError) as e:
            raise ValueError(
                f"Failed to parse response as JSON: {str(e)}. Response: {response_text}"
            ) from e

        # Backfill fields the downstream schema requires.
        response_data.setdefault("source_id", "default_source_id")
        response_data.setdefault("quote", "default_quote")

        if self.output_model:
            structured_response = self.output_model(**response_data)
            self.add_message(Role.ASSISTANT, json.dumps(response_data, indent=2))
            # Pydantic v2 renamed .dict() to .model_dump(); support both.
            dump = getattr(
                structured_response, "model_dump", structured_response.dict
            )
            return dump()

        # Add response to history and return as dict
        self.add_message(Role.ASSISTANT, response_text)
        return response_data

    def clear_history(self) -> None:
        """Reset history, re-seeding the system prompt if one was set."""
        self.conversation_history = []
        if self.system_prompt:
            self.add_message(Role.SYSTEM, self.system_prompt)
|
src/chroma_rag/chroma.sqlite3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b91a33bf0cab01f5a6e14731ab350378bc396ef1ee7d6c339b279327f33ac337
|
| 3 |
+
size 30126080
|
src/chroma_rag/e8532e8c-980b-42a2-acd8-df1effcd172a/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a13e72541800c513c73dccea69f79e39cf4baef4fa23f7e117c0d6b0f5f99670
|
| 3 |
+
size 3212000
|
src/chroma_rag/e8532e8c-980b-42a2-acd8-df1effcd172a/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ec6df10978b056a10062ed99efeef2702fa4a1301fad702b53dd2517103c746
|
| 3 |
+
size 100
|
src/chroma_rag/e8532e8c-980b-42a2-acd8-df1effcd172a/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3d7d015a7ab0459c0f4812e91eb55167f930ea9df777241555219a13bc4f9512
|
| 3 |
+
size 4000
|
src/chroma_rag/e8532e8c-980b-42a2-acd8-df1effcd172a/link_lists.bin
ADDED
|
File without changes
|
src/notebooks/read_pdf.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz  # PyMuPDF

# Load the PDF file
pdf_path = "../data/Artificial.Intelligence.A.Modern.Approach.4th.Edition.Peter.Norvig. Stuart.Russell.Pearson.9780134610993.EBooksWorld.ir.pdf"

# Use a context manager so the document is closed even if extraction fails
# (the original called doc.close() manually and leaked the handle on error).
with fitz.open(pdf_path) as doc:
    # Extract and print the text of each page (1-based page numbering).
    for page_number, page in enumerate(doc, start=1):
        print(f"Page {page_number}:")
        print(page.get_text())
        print("-" * 50)
|
src/notebooks/temp.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
import pytesseract
|
| 3 |
+
from PIL import Image
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Define a function to extract text from all pages of a PDF
|
| 7 |
+
def extract_text_from_pdf(pdf_path, dpi=300):
    """
    Extract text from all pages of a PDF via OCR.

    Each page is rasterized at the given DPI and run through Tesseract, so
    this also works for scanned (image-only) PDFs.

    Args:
        pdf_path (str): Path to the PDF file.
        dpi (int): Resolution for converting PDF pages to images (default: 300).

    Returns:
        dict: Keys are 1-based page numbers; values are the extracted text.
    """
    pdf_document = fitz.open(pdf_path)
    extracted_text = {}
    try:
        for page_number in range(len(pdf_document)):
            page = pdf_document[page_number]

            # Rasterize the page, wrap the pixels in a PIL image, then OCR it.
            pixmap = page.get_pixmap(dpi=dpi)
            image = Image.frombytes(
                "RGB", [pixmap.width, pixmap.height], pixmap.samples
            )
            text = pytesseract.image_to_string(image)

            # Store the text keyed by 1-based page number.
            extracted_text[page_number + 1] = text
    finally:
        # Close the document even if rasterization/OCR raises — the original
        # leaked the handle on error and also printed every page (debug
        # leftover), which has been removed.
        pdf_document.close()

    return extracted_text
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# Usage example
if __name__ == "__main__":
    # pdf_path = "c:/Abhi-MTech/Sem-1/AI/Books/Artificial.Intelligence.A.Modern.Approach.4th.Edition.Peter.Norvig. Stuart.Russell.Pearson.9780134610993.EBooksWorld.ir.pdf" # Path to your PDF file
    pdf_path = "c:/Abhi-MTech/Sem-1/AI/AI Technical.pdf"  # Path to your PDF file

    try:
        page_texts = extract_text_from_pdf(pdf_path)
        for page_num, page_text in page_texts.items():
            print(f"Page {page_num} Text:")
            print(page_text)
            print("-" * 80)  # Separator for readability
    except Exception as e:
        print(f"Error: {e}")
|
src/response_api.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import os
|
| 2 |
+
# import sys
|
| 3 |
+
|
| 4 |
+
# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 5 |
+
# import json
|
| 6 |
+
|
| 7 |
+
# from pydantic import ValidationError
|
| 8 |
+
# from src.assistants.assistant_v1 import gemini_rag_assistant
|
| 9 |
+
# from src.utils.knowledge_base import AgenticRAG
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# async def get_response(query=True, is_uploaded=False):
|
| 13 |
+
# rag = AgenticRAG(query_value=query, is_uploaded=is_uploaded)
|
| 14 |
+
|
| 15 |
+
# # query = input("Enter your query: ")
|
| 16 |
+
# context = rag.query(query_text=query, n_results=10)
|
| 17 |
+
# print("\nQuery Results:")
|
| 18 |
+
# print(json.dumps(context, indent=2))
|
| 19 |
+
|
| 20 |
+
# try:
|
| 21 |
+
# response = await gemini_rag_assistant.get_response(
|
| 22 |
+
# message=query, context_data=context
|
| 23 |
+
# )
|
| 24 |
+
# print("\nAssistant Response:")
|
| 25 |
+
# print(response)
|
| 26 |
+
|
| 27 |
+
# return {"response": response, "context": context}
|
| 28 |
+
|
| 29 |
+
# except ValidationError as e:
|
| 30 |
+
# print("Validation Error:", e)
|
| 31 |
+
# return {
|
| 32 |
+
# "source_id": "validation_error",
|
| 33 |
+
# "content": str(e),
|
| 34 |
+
# }
|
| 35 |
+
|
| 36 |
+
# except Exception as e:
|
| 37 |
+
# print("Internal Server Error:", e)
|
| 38 |
+
# return {
|
| 39 |
+
# "source_id": "internal_error",
|
| 40 |
+
# "content": "Internal Server Error",
|
| 41 |
+
# }
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# async def main():
|
| 45 |
+
# rag = AgenticRAG(query_value=True)
|
| 46 |
+
# # while True:
|
| 47 |
+
# query = input("Enter your query: ")
|
| 48 |
+
# results = rag.query(query_text=query, n_results=10)
|
| 49 |
+
# print("\nQuery Results:")
|
| 50 |
+
# print(json.dumps(results, indent=2))
|
| 51 |
+
|
| 52 |
+
# try:
|
| 53 |
+
# print("gemini start generating answer")
|
| 54 |
+
# response = await gemini_rag_assistant.get_response(
|
| 55 |
+
# message=query, context_data=results
|
| 56 |
+
# )
|
| 57 |
+
# print("\nAssistant Response:")
|
| 58 |
+
# print(response)
|
| 59 |
+
|
| 60 |
+
# except ValidationError as e:
|
| 61 |
+
# print("Validation Error:", e)
|
| 62 |
+
# return {
|
| 63 |
+
# "source_id": "validation_error",
|
| 64 |
+
# "content": str(e),
|
| 65 |
+
# }
|
| 66 |
+
|
| 67 |
+
# except Exception as e:
|
| 68 |
+
# print("Internal Server Error:", e)
|
| 69 |
+
# return {
|
| 70 |
+
# "source_id": "internal_error",
|
| 71 |
+
# "content": "Internal Server Error",
|
| 72 |
+
# }
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# if __name__ == "__main__":
|
| 76 |
+
# import asyncio
|
| 77 |
+
|
| 78 |
+
# asyncio.run(main())
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
import json
|
| 82 |
+
import os
|
| 83 |
+
import traceback
|
| 84 |
+
|
| 85 |
+
import google.generativeai as genai
|
| 86 |
+
from dotenv import load_dotenv
|
| 87 |
+
from fastapi import FastAPI, HTTPException
|
| 88 |
+
from pydantic import BaseModel, ValidationError
|
| 89 |
+
|
| 90 |
+
from agents_rag.crew import get_crew_response
|
| 91 |
+
from assistants.assistant_v1 import gemini_rag_assistant
|
| 92 |
+
from utils.knowledge_base import AgenticRAG
|
| 93 |
+
from utils.vectorDB import VectorStore
|
| 94 |
+
|
| 95 |
+
load_dotenv()
|
| 96 |
+
|
| 97 |
+
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 98 |
+
genai.configure(api_key=GEMINI_API_KEY)
|
| 99 |
+
|
| 100 |
+
# Initialize FastAPI app
|
| 101 |
+
app = FastAPI(title="Gemini RAG Assistant API", version="1.0.0")
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# Pydantic model for request body
class QueryRequest(BaseModel):
    """Request payload accepted by the /get-response endpoint."""

    query: str  # user's question, forwarded to retrieval and the crew pipeline
    is_uploaded: bool = False  # passed to AgenticRAG; presumably flags uploaded content — TODO confirm
    url: str  # source URL forwarded to get_crew_response
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# Pydantic model for response
class QueryResponse(BaseModel):
    """Response shape for a query (not referenced by the visible routes)."""

    response: str  # generated answer text
    context: dict  # retrieval context used to produce the answer
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def llm_answer(query=""):
|
| 118 |
+
try:
|
| 119 |
+
# Initialize Gemini RAG assistant
|
| 120 |
+
model = genai.GenerativeModel(model_name="gemini-2.0-flash-exp")
|
| 121 |
+
|
| 122 |
+
print("query", query)
|
| 123 |
+
|
| 124 |
+
response = model.generate_content(query)
|
| 125 |
+
print(response.text)
|
| 126 |
+
|
| 127 |
+
return {"response": response.text, "status": "success"}
|
| 128 |
+
|
| 129 |
+
except Exception as e:
|
| 130 |
+
print(f"Error in Gemini chunking: {e}")
|
| 131 |
+
return [
|
| 132 |
+
{
|
| 133 |
+
"response": "",
|
| 134 |
+
"status": "fail",
|
| 135 |
+
}
|
| 136 |
+
]
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
@app.post("/get-response")
|
| 140 |
+
def get_response(request: QueryRequest):
|
| 141 |
+
"""
|
| 142 |
+
Endpoint to process a query and get a response from the assistant.
|
| 143 |
+
"""
|
| 144 |
+
try:
|
| 145 |
+
# Initialize AgenticRAG and fetch context
|
| 146 |
+
rag = AgenticRAG(query_value=request.query, is_uploaded=request.is_uploaded)
|
| 147 |
+
context = rag.query(query_text=request.query, n_results=15)
|
| 148 |
+
|
| 149 |
+
print("Generate answer form gemini")
|
| 150 |
+
# Fetch response from gemini assistant
|
| 151 |
+
# response = gemini_rag_assistant.get_response_gemini(
|
| 152 |
+
# message=request.query, context_data=context
|
| 153 |
+
# )
|
| 154 |
+
|
| 155 |
+
response = get_crew_response(
|
| 156 |
+
query=request.query, context=context, url=request.url
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
print(response)
|
| 160 |
+
|
| 161 |
+
cleaned_text = "".join(
|
| 162 |
+
char for char in response if ord(char) >= 32 or char in "\n\r\t"
|
| 163 |
+
)
|
| 164 |
+
result = json.loads(cleaned_text)
|
| 165 |
+
print(result)
|
| 166 |
+
|
| 167 |
+
result = {
|
| 168 |
+
"response": result["Answer"],
|
| 169 |
+
"context": result["context"],
|
| 170 |
+
"citations": result["citations"],
|
| 171 |
+
}
|
| 172 |
+
# print(result)
|
| 173 |
+
|
| 174 |
+
return result
|
| 175 |
+
|
| 176 |
+
except ValidationError as e:
|
| 177 |
+
raise HTTPException(status_code=422, detail=f"Validation Error: {e}")
|
| 178 |
+
|
| 179 |
+
except ValueError as e:
|
| 180 |
+
raise HTTPException(status_code=400, detail=f"Value Error: {e}")
|
| 181 |
+
|
| 182 |
+
except Exception as e:
|
| 183 |
+
traceback.print_exc() # Log the full traceback
|
| 184 |
+
raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
@app.post("/llm-response")
|
| 188 |
+
def get_response_llm(request: dict):
|
| 189 |
+
"""
|
| 190 |
+
Endpoint to process a query and get a response from the assistant.
|
| 191 |
+
"""
|
| 192 |
+
try:
|
| 193 |
+
# Initialize AgenticRAG and fetch context
|
| 194 |
+
|
| 195 |
+
print("Generate answer form gemini")
|
| 196 |
+
# Fetch response from gemini assistant
|
| 197 |
+
|
| 198 |
+
result = llm_answer(query=request["query"])
|
| 199 |
+
|
| 200 |
+
result = {
|
| 201 |
+
"response": result["response"],
|
| 202 |
+
}
|
| 203 |
+
# print(result)
|
| 204 |
+
|
| 205 |
+
return result
|
| 206 |
+
|
| 207 |
+
except ValidationError as e:
|
| 208 |
+
raise HTTPException(status_code=422, detail=f"Validation Error: {e}")
|
| 209 |
+
|
| 210 |
+
except ValueError as e:
|
| 211 |
+
raise HTTPException(status_code=400, detail=f"Value Error: {e}")
|
| 212 |
+
|
| 213 |
+
except Exception as e:
|
| 214 |
+
traceback.print_exc() # Log the full traceback
|
| 215 |
+
raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
@app.get("/health")
|
| 219 |
+
def health_check():
|
| 220 |
+
"""
|
| 221 |
+
Endpoint for health check.
|
| 222 |
+
"""
|
| 223 |
+
return {"status": "ok"}
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
@app.post("/delete-file")
|
| 227 |
+
async def process_upload_data(request: dict):
|
| 228 |
+
"""
|
| 229 |
+
Endpoint to retrieve do emedding of new file and store the result in Vector database.
|
| 230 |
+
"""
|
| 231 |
+
try:
|
| 232 |
+
db = VectorStore()
|
| 233 |
+
|
| 234 |
+
print("deletion started.")
|
| 235 |
+
db.delete_documents_by_filename(request["file_path"])
|
| 236 |
+
print("deletion end.")
|
| 237 |
+
|
| 238 |
+
return {"response": 200}
|
| 239 |
+
|
| 240 |
+
except ValidationError as e:
|
| 241 |
+
raise HTTPException(status_code=422, detail=f"Validation Error: {e}")
|
| 242 |
+
|
| 243 |
+
except ValueError as e:
|
| 244 |
+
raise HTTPException(status_code=400, detail=f"Value Error: {e}")
|
| 245 |
+
|
| 246 |
+
except Exception as e:
|
| 247 |
+
traceback.print_exc() # Log the full traceback
|
| 248 |
+
raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
@app.post("/process-file")
|
| 252 |
+
async def process_upload_data(request: dict):
|
| 253 |
+
"""
|
| 254 |
+
Endpoint to retrieve do emedding of new file and store the result in Vector database.
|
| 255 |
+
"""
|
| 256 |
+
try:
|
| 257 |
+
# Initialize AgenticRAG and fetch context
|
| 258 |
+
rag = AgenticRAG(is_uploaded=False)
|
| 259 |
+
|
| 260 |
+
print("process started.")
|
| 261 |
+
rag.process_file(request["file_path"])
|
| 262 |
+
print("process end.")
|
| 263 |
+
|
| 264 |
+
# rag = AgenticRAG(query_value=request.query)
|
| 265 |
+
# context = rag.query(query_text=request.query, n_results=10)
|
| 266 |
+
|
| 267 |
+
# # Fetch response from gemini assistant
|
| 268 |
+
# response = await gemini_rag_assistant.get_response(
|
| 269 |
+
# message=request.query, context_data=context
|
| 270 |
+
# )
|
| 271 |
+
|
| 272 |
+
# # Ensure response is in correct format
|
| 273 |
+
# if not isinstance(response, str):
|
| 274 |
+
# raise ValueError("Unexpected response format from gemini_rag_assistant.")
|
| 275 |
+
|
| 276 |
+
return {"response": 200}
|
| 277 |
+
|
| 278 |
+
except ValidationError as e:
|
| 279 |
+
raise HTTPException(status_code=422, detail=f"Validation Error: {e}")
|
| 280 |
+
|
| 281 |
+
except ValueError as e:
|
| 282 |
+
raise HTTPException(status_code=400, detail=f"Value Error: {e}")
|
| 283 |
+
|
| 284 |
+
except Exception as e:
|
| 285 |
+
traceback.print_exc() # Log the full traceback
|
| 286 |
+
raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
if __name__ == "__main__":
|
| 290 |
+
import uvicorn
|
| 291 |
+
|
| 292 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
src/run_app.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
+
from pydantic import ValidationError
|
| 8 |
+
|
| 9 |
+
from src.assistants.assistant_v1 import gemini_rag_assistant
|
| 10 |
+
from src.utils.knowledge_base import AgenticRAG
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def main():
    """Interactive loop: retrieve context for each query and print the
    Gemini assistant's answer until an error ends the session."""
    rag = AgenticRAG(query_value=True)
    while True:
        user_query = input("Enter your query: ")
        results = rag.query(query_text=user_query, n_results=10)
        print("\nQuery Results:")
        print(json.dumps(results, indent=2))

        try:
            print("gemini start generating answer")
            answer = gemini_rag_assistant.get_response_gemini(
                message=user_query, context_data=results
            )
            print("\nAssistant Response:")
            print(answer)

        except ValidationError as e:
            # A schema failure ends the loop with a structured error payload.
            print("Validation Error:", e)
            return {
                "source_id": "validation_error",
                "content": str(e),
            }

        except Exception as e:
            # Any other failure likewise terminates the session.
            print("Internal Server Error:", e)
            return {
                "source_id": "internal_error",
                "content": "Internal Server Error",
            }


if __name__ == "__main__":
    main()
|
src/utils/__init__.py
ADDED
|
File without changes
|
src/utils/download.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import traceback
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
+
import yt_dlp
|
| 6 |
+
from bs4 import BeautifulSoup
|
| 7 |
+
from download_video import downlaod_video_from_url
|
| 8 |
+
from pytube import YouTube
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def download_youtube_video(url, download_path="../data/"):
    """Download the highest-resolution progressive MP4 stream of a YouTube video.

    Best-effort: failures are printed, never raised.
    """
    try:
        video = YouTube(url)

        # Progressive streams bundle audio and video in one MP4 container.
        candidates = video.streams.filter(progressive=True, file_extension="mp4")
        best = candidates.order_by("resolution").desc().first()

        if best is None:
            print("No suitable video stream found")
        else:
            best.download(output_path=download_path)
            print(f"Video downloaded successfully to {download_path}")
    except Exception as e:
        print(f"Error in downloading YouTube video: {e}")
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def download_audio(url, download_path="../data/"):
    """
    Download audio from YouTube and convert to MP3 format.

    Args:
        url: YouTube video URL
        download_path: Path where the MP3 file will be saved
    """
    ydl_opts = {
        "outtmpl": f"{download_path}%(title)s.%(ext)s",
        "format": "bestaudio/best",
        # FIX: the Python API expects underscore option names; the dashed
        # CLI spellings ("geo-bypass", "force-ipv4", "headers") used before
        # were silently ignored by yt_dlp.YoutubeDL.
        "geo_bypass": True,
        "noplaylist": True,
        "source_address": "0.0.0.0",  # API equivalent of the --force-ipv4 flag
        # Convert the downloaded track to MP3 via ffmpeg.
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
        "http_headers": {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        },
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        print(f"Audio downloaded and converted to MP3 successfully at {download_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# Function to download PDF, DOC, or other files
|
| 69 |
+
def download_file(url, download_path="../data/"):
    """Stream a file (PDF, DOC, ...) from *url* into *download_path*.

    The saved filename is the last path component of the URL. Best-effort:
    errors are printed, never raised.
    """
    try:
        os.makedirs(download_path, exist_ok=True)  # ensure target dir exists
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()  # Check if the request was successful
        filename = os.path.join(download_path, url.split("/")[-1])

        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    file.write(chunk)
        # FIX: the message was an f-string with no placeholder; report the path.
        print(f"File downloaded successfully to {filename}")
    except Exception as e:
        print(f"An error occurred: {e}")
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# Function to download text or webpage content
|
| 85 |
+
def download_text_or_webpage(url, download_path="../data/", is_text=False):
    """Save *url* either as raw text (.txt) or as prettified HTML (.html).

    Args:
        url: Address to fetch.
        download_path: Directory the file is written into.
        is_text: When True, store the raw response body; otherwise parse and
            pretty-print the HTML with BeautifulSoup.
    """
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        if is_text:
            filename = os.path.join(download_path, url.split("/")[-1] + ".txt")
            # Explicit encoding avoids platform-dependent default codecs.
            with open(filename, "w", encoding="utf-8") as file:
                file.write(response.text)
            # FIX: the message was an f-string with no placeholder.
            print(f"Text file downloaded successfully to {filename}")
        else:
            soup = BeautifulSoup(response.text, "html.parser")
            filename = os.path.join(download_path, url.split("/")[-1] + ".html")
            with open(filename, "w", encoding="utf-8") as file:
                file.write(soup.prettify())
            print(f"Webpage downloaded successfully to {filename}")

    except Exception as e:
        print(f"An error occurred: {e}")
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def main():
    """Demo entry point: download the audio track of a sample YouTube video.

    The other helpers in this module (downlaod_video_from_url, download_file,
    download_text_or_webpage) can be exercised the same way with a URL.
    """
    sample_audio_url = "https://www.youtube.com/watch?v=8OHYynw7Yh4"
    download_audio(sample_audio_url)  # Download audio


if __name__ == "__main__":
    main()
|
src/utils/download_video.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import random
|
| 4 |
+
import time
|
| 5 |
+
from typing import Any, Dict, List, Optional
|
| 6 |
+
|
| 7 |
+
import requests
|
| 8 |
+
import yt_dlp
|
| 9 |
+
from bs4 import BeautifulSoup
|
| 10 |
+
from pytube import YouTube
|
| 11 |
+
|
| 12 |
+
# Configure logging
|
| 13 |
+
logging.basicConfig(
|
| 14 |
+
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
| 15 |
+
)
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ContentDownloader:
    """Downloads YouTube content with yt-dlp, falling back to pytube streams."""

    def __init__(self, download_path: str = "./downloads/"):
        # All downloads land in this directory; it is created eagerly.
        self.download_path = download_path
        self.create_download_directory()

    def create_download_directory(self) -> None:
        """Create download directory if it doesn't exist."""
        os.makedirs(self.download_path, exist_ok=True)

    def _get_available_formats(self, url: str) -> List[Dict]:
        """Get list of available formats for a YouTube video.

        Only formats carrying both a video and an audio track are kept;
        an empty list is returned on any extraction error.
        """
        ydl_opts = {"quiet": True, "no_warnings": True, "extract_flat": True}

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
                formats = info.get("formats", [])
                # Filter for formats that have both video and audio
                combined_formats = [
                    f
                    for f in formats
                    if f.get("vcodec") != "none" and f.get("acodec") != "none"
                ]
                return combined_formats
        except Exception as e:
            logger.error(f"Error getting formats: {str(e)}")
            return []

    def download_youtube_content(
        self, url: str, download_audio: bool = False
    ) -> Optional[str]:
        """
        Download YouTube content with automatic format selection.

        When download_audio is True the best audio track is extracted to MP3;
        otherwise the best MP4 video+audio combination is downloaded. On
        failure the pytube direct-stream fallback is attempted. Returns the
        download directory on yt-dlp success, the fallback's file path, or
        None on failure.
        """
        if download_audio:
            ydl_opts = {
                "outtmpl": os.path.join(self.download_path, "%(title)s.%(ext)s"),
                "format": "bestaudio/best",
                "postprocessors": [
                    {
                        "key": "FFmpegExtractAudio",
                        "preferredcodec": "mp3",
                    }
                ],
            }
        else:
            # Get available formats first
            formats = self._get_available_formats(url)
            if not formats:
                logger.error("No suitable formats found")
                return None

            # Configure options for video download
            ydl_opts = {
                "outtmpl": os.path.join(self.download_path, "%(title)s.%(ext)s"),
                "format": "bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4] / bv*+ba/b",  # Prefer MP4 format
                "merge_output_format": "mp4",
                "postprocessors": [
                    {
                        "key": "FFmpegVideoRemuxer",
                        "preferedformat": "mp4",
                    }
                ],
                "quiet": False,
                "no_warnings": False,
                "max_filesize": 2048 * 1024 * 1024,  # 2GB max
                "geo_bypass": True,
                "nocheckcertificate": True,
                # Browser-like headers to reduce the chance of being blocked.
                "http_headers": {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "en-us,en;q=0.5",
                    "Sec-Fetch-Mode": "navigate",
                },
            }

        try:
            # First update yt-dlp (shells out to the installed CLI binary)
            os.system("yt-dlp -U")

            # Attempt download with yt-dlp
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                logger.info("Attempting download with yt-dlp...")
                ydl.download([url])
                return self.download_path

        except Exception as e:
            logger.warning(f"yt-dlp download failed: {str(e)}")
            logger.info("Attempting fallback to direct stream download...")
            return self._download_with_direct_stream(url)

    def _download_with_direct_stream(
        self, url: str, max_retries: int = 3
    ) -> Optional[str]:
        """Alternative download method using direct stream access.

        Retries up to max_retries times with a short random backoff between
        attempts. Returns the downloaded file path, or None.
        """
        for attempt in range(max_retries):
            try:
                if attempt > 0:
                    # Random backoff between retries to look less bot-like.
                    time.sleep(random.uniform(2, 5))

                yt = YouTube(url)
                # Sort streams by both resolution and bitrate
                streams = yt.streams.filter(progressive=True, file_extension="mp4")
                stream = streams.order_by("resolution").desc().first()

                if stream:
                    # Add random query parameter to avoid caching
                    timestamp = int(time.time())
                    # NOTE(review): pytube's Stream.url is normally a property;
                    # confirm this assignment actually affects the request URL.
                    stream.url = f"{stream.url}&_={timestamp}"

                    file_path = stream.download(
                        output_path=self.download_path,
                        filename_prefix=f"video_{timestamp}_",
                    )
                    logger.info(f"Successfully downloaded to: {file_path}")
                    return file_path
                else:
                    logger.error("No suitable stream found")
                    return None

            except Exception as e:
                logger.error(f"Download attempt {attempt + 1} failed: {str(e)}")
                if attempt == max_retries - 1:
                    logger.error("All download attempts failed")
                    return None
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def downlaod_video_from_url(youtube_url="", download_path="./downloads/"):
    """Download a YouTube video (with audio) into *download_path*.

    Lists the available combined audio+video formats before downloading.
    The function name is misspelled but kept because existing callers import
    it; prefer the ``download_video_from_url`` alias in new code.
    """
    # Update yt-dlp first
    os.system("yt-dlp -U")

    downloader = ContentDownloader(download_path=download_path)

    # First, check available formats
    formats = downloader._get_available_formats(youtube_url)
    if formats:
        print("\nAvailable formats:")
        for f in formats:
            print(
                f"Format ID: {f.get('format_id')} - "
                f"Resolution: {f.get('resolution')} - "
                f"Filesize: {f.get('filesize_approx', 'unknown')} bytes"
            )

    # Download video with audio
    video_path = downloader.download_youtube_content(youtube_url)
    if video_path:
        print(f"\nVideo downloaded to: {video_path}")
    else:
        print("\nDownload failed")


# Correctly-spelled alias; the misspelled name stays for backward compatibility.
download_video_from_url = downlaod_video_from_url
|
src/utils/json_file_record.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "esnXVYfJ",
|
| 4 |
+
"file_path": "data\\What Makes Hanuman Chalisa So POWERFUL - Tantric Explains TRS Clips.mp3"
|
| 5 |
+
},
|
| 6 |
+
{
|
| 7 |
+
"id": "iVeOlEJl",
|
| 8 |
+
"file_path": "data\\The Science Of Building Extreme Discipline - Andrew Huberman.mp3"
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"id": "nrrAW71c",
|
| 12 |
+
"file_path": "data\\attenstion_is_all_you_need.pdf"
|
| 13 |
+
}
|
| 14 |
+
]
|
src/utils/knowledge_base.py
ADDED
|
@@ -0,0 +1,523 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
# Add the project root to Python path
|
| 5 |
+
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
|
| 6 |
+
sys.path.append(project_root)
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
import re
|
| 11 |
+
import time
|
| 12 |
+
from typing import Any, Dict, List, Optional
|
| 13 |
+
from urllib.parse import urlparse
|
| 14 |
+
|
| 15 |
+
import docx
|
| 16 |
+
import exceptiongroup
|
| 17 |
+
import google.generativeai as genai
|
| 18 |
+
import numpy as np
|
| 19 |
+
import pandas as pd
|
| 20 |
+
import PyPDF2
|
| 21 |
+
import requests
|
| 22 |
+
from bs4 import BeautifulSoup
|
| 23 |
+
from dotenv import load_dotenv
|
| 24 |
+
from nanoid import generate
|
| 25 |
+
from pydantic import BaseModel, Field
|
| 26 |
+
|
| 27 |
+
from src.utils.vectorDB import VectorStore
|
| 28 |
+
|
| 29 |
+
load_dotenv()
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class MetadataPDF(BaseModel):
    """Per-segment metadata for PDF/DOCX chunks; doubles as Gemini's response schema."""

    # Topics Gemini extracted for the segment.
    key_concepts: List[str] = Field(
        ..., description="Key concepts related to the topic"
    )
    # Serialized as "page-number" in the JSON payload (note the alias).
    page_number: int = Field(
        ...,
        alias="page-number",
        description="The page number where this content is located",
    )
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class SegmentPDF(BaseModel):
    """One content chunk of a PDF/DOCX document plus its metadata."""

    content: str = Field(..., description="The main text of the segment")
    # Nested metadata object — chunk consumers read it via segment["metadata"].
    metadata: MetadataPDF
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class AnalyzedContentPDF(BaseModel):
    """Top-level Gemini response schema for chunked PDF/DOCX content."""

    segments: List[SegmentPDF] = Field(
        ..., description="List of meaningful content segments"
    )
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# for text data
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class MetadataTxt(BaseModel):
    """Per-segment metadata for plain-text chunks (no page numbers)."""

    # Topics Gemini extracted for the segment.
    key_concepts: List[str] = Field(
        ..., description="Key concepts related to the topic"
    )
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class SegmentTxt(BaseModel):
    """One content chunk of a plain-text source plus its metadata."""

    content: str = Field(..., description="The main text of the segment")
    # Nested metadata object — chunk consumers read it via segment["metadata"].
    metadata: MetadataTxt
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class AnalyzedContentTxt(BaseModel):
    """Top-level Gemini response schema for chunked plain-text content."""

    segments: List[SegmentTxt] = Field(
        ..., description="List of meaningful content segments"
    )
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class GeminiChunker:
    """Uses the Gemini API to split documents and media into meaningful chunks."""

    def __init__(self):
        # API key comes from the environment (.env is loaded at module import).
        self.api_key = os.getenv("GEMINI_API_KEY")
        genai.configure(api_key=self.api_key)
        # self.model = genai.GenerativeModel(model_name="gemini-2.0-flash-exp")
        self.model = genai.GenerativeModel(model_name="gemini-1.5-flash")
        # self.model = genai.GenerativeModel(model_name="gemini-1.5-pro")

    def check_file_ready(self, file):
        """Block until an uploaded file leaves the PROCESSING state.

        Raises:
            ValueError: if Gemini reports the upload as FAILED.
        """
        while file.state.name == "PROCESSING":
            print(".", end="")
            time.sleep(10)
            file = genai.get_file(file.name)

        if file.state.name == "FAILED":
            raise ValueError(f"File processing failed: {file.state.name}")

    def chunk_with_gemini(
        self, content: str, content_type: str
    ) -> List[Dict[str, Any]]:
        """Ask Gemini to segment *content* and normalize the reply to chunk dicts.

        Each returned dict has "content" plus a "metadata" dict with "topics",
        "page-number" and "type". On any failure the whole content is returned
        as a single fallback chunk.
        """
        if content_type == "pdf" or content_type == "docx":
            """Use Gemini to intelligently chunk content based on semantic understanding"""
            prompt = f"""
            Analyze the following {content_type} content first means read whole content first then after divide it into complete and meaningful segments (chunks).
            Each chunk size has 512 token should:
            1. Be self-contained and end at logical boundaries (e.g., complete sentences or sections).
            2. Include all text that belongs to a single segment without truncation.
            3. Ensure the last chunk is fully complete and not cut off.

            Return the response strictly in the specified schema format:

            {{
                "content": "segment text here",
                "metadata": {{
                    "key_concepts": ["concept1", "concept2"],
                    "page-number": 64
                }}
            }},
            // more segments...

            Content to analyze:
            {content}

            Keep the response as pure JSON without any additional text or explanation. Avoid splitting content mid-sentence or mid-thought.
            All chunks should be complete.
            """
            schema = AnalyzedContentPDF
        else:
            prompt = f"""
            Analyze the following {content_type} content first means read whole content first then after divide it into complete and meaningful segments (chunks).
            Each chunk should:
            1. Be self-contained and end at logical boundaries (e.g., complete sentences or sections).
            2. Include all text that belongs to a single segment without truncation.
            3. Ensure the last chunk is fully complete and not cut off.

            Return the response strictly in the specified schema format:

            {{
                "content": "segment text here",
                "metadata": {{
                    "key_concepts": ["concept1", "concept2"],
                    "page-number": NA
                }}
            }},
            // more segments...

            Content to analyze:
            {content}

            Keep the response as pure JSON without any additional text or explanation. Avoid splitting content mid-sentence or mid-thought.
            All chunks should be complete.
            """
            schema = AnalyzedContentTxt

        print(schema)

        try:
            response = self.model.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(
                    response_mime_type="application/json",
                    response_schema=schema,
                ),
            )
            # Strip control characters that would break json.loads.
            cleaned_text = "".join(
                char for char in response.text if ord(char) >= 32 or char in "\n\r\t"
            )
            # Debug artifact: dump the raw chunking output for inspection.
            with open("chunking_text.txt", "w", encoding="utf-8") as file_text:
                print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
                file_text.write(cleaned_text)
                print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")

            cleaned_text = cleaned_text.encode("utf-8", errors="replace").decode(
                "utf-8"
            )

            result = json.loads(cleaned_text)

            print("pdf parsing")

            chunks = []

            for segment in result.get("segments", []):
                # Metadata is nested under segment["metadata"] per the schema.
                temp_metadata = segment["metadata"]
                if content_type == "pdf":
                    chunk = {
                        "content": segment.get("content", ""),
                        "metadata": {
                            "topics": temp_metadata.get("key_concepts", []),
                            "page-number": temp_metadata.get("page-number", ""),
                            "type": "pdf",
                        },
                    }
                else:
                    # FIX: the original read key_concepts/page-number directly off
                    # the segment (not its metadata), so text chunks always ended
                    # up with empty topics and page numbers.
                    chunk = {
                        "content": segment.get("content", ""),
                        "metadata": {
                            "topics": temp_metadata.get("key_concepts", []),
                            "page-number": temp_metadata.get("page-number", ""),
                            "type": "text",
                        },
                    }
                chunks.append(chunk)

            return chunks

        except Exception as e:
            print(f"Error in Gemini chunking: {e}")
            # Fallback: return the whole content as a single chunk.
            return [
                {
                    "content": content,
                    "metadata": {
                        "topics": "",
                        "page-number": "0",
                    },
                }
            ]

    def process_media_file(
        self, file_path: str, media_type: str
    ) -> List[Dict[str, Any]]:
        """Process video or audio file using Gemini's media understanding"""
        try:
            print("here-0!!!")
            media_file = genai.upload_file(path=file_path)
            self.check_file_ready(media_file)
            print("here-1!!!")

            if media_type == "video":
                # Timestamped scene descriptions with topics.
                schema = {
                    "type": "object",
                    "properties": {
                        "segments": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "timestamp": {"type": "string"},
                                    "description": {"type": "string"},
                                    "topics": {
                                        "type": "array",
                                        "items": {"type": "string"},
                                    },
                                },
                            },
                        }
                    },
                }

                prompt = "Describe this video in detail, breaking it into timestamped segments. Include key events and actions."
            else:  # audio
                # Timestamped transcription segments with speaker and topics.
                schema = {
                    "type": "object",
                    "properties": {
                        "segments": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "timestamp": {"type": "string"},
                                    "transcription": {"type": "string"},
                                    "speaker": {"type": "string"},
                                    "topics": {
                                        "type": "array",
                                        "items": {"type": "string"},
                                    },
                                },
                            },
                        }
                    },
                }

                prompt = "Transcribe this audio, identifying speakers and key topics discussed."

            print("Here-2!!!")
            response = self.model.generate_content(
                [media_file, prompt],
                generation_config=genai.GenerationConfig(
                    response_schema=schema, response_mime_type="application/json"
                ),
            )
            print("Here-3!!!")

            # Convert Gemini's media response to our standard chunk format
            print(response.text)
            print("Here-4!!!")
            cleaned_text = "".join(
                char for char in response.text if ord(char) >= 32 or char in "\n\r\t"
            )
            result = json.loads(cleaned_text)
            chunks = []
            print("Here-5!!!")

            for segment in result.get("segments", []):
                if media_type == "video":
                    chunk = {
                        "content": segment.get("description", ""),
                        "metadata": {
                            "timestamp": segment.get("timestamp", ""),
                            # FIX: the schema emits "topics"; the original read
                            # "key_events", which never exists, so video topics
                            # were always empty.
                            "topics": segment.get("topics", []),
                            "type": "video",
                        },
                    }
                else:
                    chunk = {
                        "content": segment.get("transcription", ""),
                        "metadata": {
                            "timestamp": segment.get("timestamp", ""),
                            "speaker": segment.get("speaker", ""),
                            "topics": segment.get("topics", []),
                            "type": "audio",
                        },
                    }
                chunks.append(chunk)
            print("Here-6!!!")

            return chunks

        except Exception as e:
            print(f"Error processing {media_type} file: {e}")
            return [
                {
                    "content": f"Error processing {media_type} file",
                    "metadata": {"type": media_type, "error": str(e)},
                }
            ]
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
class ContentProcessor:
    """Dispatches each supported source type to a Gemini-backed chunking routine."""

    def __init__(self):
        self.gemini_chunker = GeminiChunker()

    def process_text(self, text: str, source_type: str) -> List[Dict[str, Any]]:
        """Process any text content using Gemini chunking."""
        chunks = self.gemini_chunker.chunk_with_gemini(text, source_type)
        for chunk in chunks:
            chunk["metadata"]["source_type"] = source_type
        return chunks

    def process_pdf(self, file_path: str) -> List[Dict[str, Any]]:
        """Extract all page text from a PDF and chunk it."""
        with open(file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            full_text = ""
            for page in pdf_reader.pages:
                # FIX: extract_text() can return None (e.g. image-only pages),
                # which previously raised a TypeError on concatenation.
                full_text += (page.extract_text() or "") + " "
        return self.process_text(full_text, "pdf")

    def process_docx(self, file_path: str) -> List[Dict[str, Any]]:
        """Concatenate the paragraphs of a .docx file and chunk them."""
        doc = docx.Document(file_path)
        full_text = " ".join(paragraph.text for paragraph in doc.paragraphs)
        return self.process_text(full_text, "docx")

    def process_csv(self, file_path: str) -> List[Dict[str, Any]]:
        """Render a CSV as a table string and chunk it."""
        df = pd.read_csv(file_path)
        # Convert DataFrame to a more readable format for Gemini
        text_content = df.to_string()
        return self.process_text(text_content, "csv")

    def process_webpage(self, url: str) -> List[Dict[str, Any]]:
        """Fetch a web page, strip scripts/styles, and chunk the visible text."""
        response = requests.get(url, timeout=60)  # timeout guards against hangs
        soup = BeautifulSoup(response.text, "html.parser")
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text()
        return self.process_text(text, "webpage")

    def process_video(self, file_path: str) -> List[Dict[str, Any]]:
        """Process video using Gemini's video understanding capabilities"""
        print("in process function of video")
        # FIX: process_media_file uploads the file itself; the extra
        # genai.upload_file() call made here uploaded the same file twice.
        return self.gemini_chunker.process_media_file(
            file_path=file_path, media_type="video"
        )

    def process_audio(self, file_path: str) -> List[Dict[str, Any]]:
        """Process audio using Gemini's audio understanding capabilities"""
        print("in process function of audio")
        # FIX: same redundant double upload removed as in process_video.
        return self.gemini_chunker.process_media_file(
            file_path=file_path, media_type="audio"
        )
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
class AgenticRAG:
|
| 406 |
+
def __init__(self, query_value=False, is_uploaded=False):
    """Wire up the content processor, the vector store, and the path of the
    JSON ledger that records already-ingested files."""
    self.processor = ContentProcessor()
    self.vector_store = VectorStore(query=query_value, is_uploaded=is_uploaded)
    # Ledger lives in the CWD when ingesting uploads, otherwise under utils/.
    self.json_file_path = (
        "json_file_record.json"
        if query_value == False and is_uploaded == True
        else "utils/json_file_record.json"
    )
|
| 413 |
+
|
| 414 |
+
def process_file(self, file_path: str, file_type: Optional[str] = None):
|
| 415 |
+
if file_type is None:
|
| 416 |
+
file_type = self._detect_file_type(file_path)
|
| 417 |
+
|
| 418 |
+
if os.path.exists(self.json_file_path):
|
| 419 |
+
with open(self.json_file_path, "r") as json_file:
|
| 420 |
+
json_data = json.load(json_file)
|
| 421 |
+
for record in json_data:
|
| 422 |
+
if record["file_path"] == file_path:
|
| 423 |
+
return True # File path exists
|
| 424 |
+
try:
|
| 425 |
+
chunks = []
|
| 426 |
+
if file_type == "pdf":
|
| 427 |
+
chunks = self.processor.process_pdf(file_path)
|
| 428 |
+
elif file_type == "docx":
|
| 429 |
+
chunks = self.processor.process_docx(file_path)
|
| 430 |
+
elif file_type == "csv":
|
| 431 |
+
chunks = self.processor.process_csv(file_path)
|
| 432 |
+
elif file_type == "url":
|
| 433 |
+
chunks = self.processor.process_webpage(file_path)
|
| 434 |
+
elif file_type == "video":
|
| 435 |
+
chunks = self.processor.process_video(file_path)
|
| 436 |
+
elif file_type == "audio":
|
| 437 |
+
chunks = self.processor.process_audio(file_path)
|
| 438 |
+
elif file_type == "text":
|
| 439 |
+
with open(file_path, "r") as file:
|
| 440 |
+
chunks = self.processor.process_text(file.read(), "text")
|
| 441 |
+
|
| 442 |
+
if chunks:
|
| 443 |
+
# Add source information to metadata
|
| 444 |
+
print("in processfile fucntion file.")
|
| 445 |
+
for chunk in chunks:
|
| 446 |
+
chunk["metadata"]["source"] = file_path
|
| 447 |
+
|
| 448 |
+
print(chunks)
|
| 449 |
+
|
| 450 |
+
# Add to Vector Database.
|
| 451 |
+
self.vector_store.add_documents(chunks)
|
| 452 |
+
print(f"Successfully processed {file_path} with {len(chunks)} chunks")
|
| 453 |
+
|
| 454 |
+
return True
|
| 455 |
+
return False
|
| 456 |
+
|
| 457 |
+
except Exception as e:
|
| 458 |
+
print(f"Error processing {file_path}: {e}")
|
| 459 |
+
|
| 460 |
+
def _detect_file_type(self, file_path: str) -> str:
|
| 461 |
+
if file_path.startswith("http"):
|
| 462 |
+
return "url"
|
| 463 |
+
|
| 464 |
+
extension = file_path.split(".")[-1].lower()
|
| 465 |
+
type_mapping = {
|
| 466 |
+
"pdf": "pdf",
|
| 467 |
+
"docx": "docx",
|
| 468 |
+
"doc": "docx",
|
| 469 |
+
"csv": "csv",
|
| 470 |
+
"txt": "text",
|
| 471 |
+
"mp3": "audio",
|
| 472 |
+
"wav": "audio",
|
| 473 |
+
"mp4": "video",
|
| 474 |
+
"mov": "video",
|
| 475 |
+
}
|
| 476 |
+
return type_mapping.get(extension, "unknown")
|
| 477 |
+
|
| 478 |
+
def query(self, query_text: str, n_results: int = 5) -> Dict:
|
| 479 |
+
return self.vector_store.query(query_text, n_results)
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
# Define a function to determine the file type based on the extension
def get_file_type(file_name: str) -> str:
    """Return the processing category for *file_name*.

    Checks the "http" prefix first so that URLs ending in a known extension
    (e.g. "http://host/doc.pdf") are routed to the web-page processor,
    matching AgenticRAG._detect_file_type. Extension matching is
    case-insensitive and covers the same set as _detect_file_type.

    Args:
        file_name: File name, path, or URL.

    Returns:
        One of "audio", "video", "csv", "pdf", "docx", "text", "url",
        or "unknown".
    """
    if file_name.startswith("http"):
        return "url"

    extension = file_name.rsplit(".", 1)[-1].lower()
    type_mapping = {
        "mp3": "audio",
        "wav": "audio",
        "mp4": "video",
        "mov": "video",
        "csv": "csv",
        "pdf": "pdf",
        "docx": "docx",
        "doc": "docx",
        "txt": "text",
    }
    return type_mapping.get(extension, "unknown")
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
def main():
    """Ingest every regular file found in the ../data directory.

    Initializes the RAG system in upload mode, detects each file's type,
    and processes the files one by one with a short pause between them.
    """
    # Initialize the RAG system
    rag = AgenticRAG(is_uploaded=True)

    # Automatically read the files in the 'data' directory
    data_directory = "../data"
    # Guard: report a missing data directory instead of raising.
    if not os.path.isdir(data_directory):
        print(f"Data directory {data_directory} does not exist.")
        return

    # Collect (path, type) pairs for every regular file in the directory.
    test_files = [
        (os.path.join(data_directory, filename), get_file_type(filename))
        for filename in os.listdir(data_directory)
        if os.path.isfile(os.path.join(data_directory, filename))
    ]

    # Process each file
    for file_path, file_type in test_files:
        print(f"\nProcessing {file_path}...")
        # Crude pacing between files to avoid hammering the upload API.
        time.sleep(5)
        rag.process_file(file_path, file_type)


if __name__ == "__main__":
    main()
|
src/utils/vectorDB.py
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
from typing import Any, Dict, List
|
| 5 |
+
|
| 6 |
+
# Add the project root to Python path
|
| 7 |
+
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
| 8 |
+
sys.path.append(project_root)
|
| 9 |
+
|
| 10 |
+
import chromadb
|
| 11 |
+
import numpy as np
|
| 12 |
+
import torch
|
| 13 |
+
from nanoid import generate
|
| 14 |
+
from transformers import AutoModel, AutoTokenizer
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def append_to_json(new_entries, filename="json_file_record.json"):
    """
    Append new entries to an existing JSON array file, or create a new one if it doesn't exist.

    Args:
        new_entries (list): List of dictionaries to append
        filename (str): Name of the JSON file
    """
    try:
        # Read existing data if file exists and is non-empty.
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            with open(filename, "r") as f:
                data = json.load(f)
            # A non-list payload is replaced by a fresh array.
            if not isinstance(data, list):
                data = []
        else:
            data = []

        # Bug fix: this extend was commented out, so the function rewrote
        # the file without ever adding the new entries.
        data.extend(new_entries)

        # Write back the updated data
        with open(filename, "w") as f:
            json.dump(data, f, indent=4)

    except json.JSONDecodeError:
        # Handle case where file exists but is not valid JSON: start over
        # with just the new entries.
        with open(filename, "w") as f:
            json.dump(new_entries, f, indent=4)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class BERTEmbedder:
    """Produces mean-pooled sentence embeddings from bert-base-uncased."""

    def __init__(self):
        """Load the tokenizer and model, moving the model to GPU if present."""
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.model = AutoModel.from_pretrained("bert-base-uncased")
        # Run on GPU when one is available, otherwise fall back to CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """Return a 2-D array with one embedding row per input text.

        Each row is the mean of the final hidden states over the token axis.
        """
        self.model.eval()
        vectors = []
        with torch.no_grad():
            for sentence in texts:
                encoded = self.tokenizer(
                    sentence,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors="pt",
                )
                # Move every input tensor onto the model's device.
                encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}
                output = self.model(**encoded)
                # Mean-pool token embeddings into a single sentence vector.
                vectors.append(output.last_hidden_state.mean(dim=1).cpu().numpy())
        return np.vstack(vectors)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class VectorStore:
    """ChromaDB-backed store of BERT-embedded document chunks.

    A JSON ledger maps each generated batch ID to the source file path so
    that documents can later be located and deleted by file name.
    """

    def __init__(
        self, persist_directory: str = "../chroma_rag", query=False, is_uploaded=False
    ):
        """Open (or create) the Chroma collection for the selected mode.

        Args:
            persist_directory: ChromaDB storage directory (store mode only;
                query mode always uses "chroma_rag").
            query: True when the instance is used for querying.
            is_uploaded: True when documents are ingested from an upload flow.

        Raises:
            Exception: re-raised after logging if initialization fails.
        """
        try:
            # The two original branches duplicated the whole setup verbatim;
            # only the directory and ledger path differ, so pick those here
            # and share the rest via _init_store.
            if not query and is_uploaded:
                print("Embedding store mode.")
                self.json_file_path = "json_file_record.json"
            else:
                print("query mode")
                persist_directory = "chroma_rag"
                self.json_file_path = "utils/json_file_record.json"
            self._init_store(persist_directory)
        except Exception as e:
            print(f"Error initializing VectorStore: {e}")
            raise

    def _init_store(self, persist_directory: str) -> None:
        """Create the client, collection, and embedder (shared by both modes)."""
        print(f"Initializing ChromaDB with directory: {persist_directory}")
        self.client = chromadb.PersistentClient(path=persist_directory)
        print("ChromaDB client created successfully")

        self.collection = self.client.get_or_create_collection(
            name="documents",
            metadata={"hnsw:space": "cosine"},
            embedding_function=None,  # We're using our own embeddings
        )
        print(f"Collection 'documents' ready")

        self.embedder = BERTEmbedder()
        print("BERT embedder initialized")

        # Check if collection has documents
        content = self.collection.get()
        print(f"Collection contains {len(content['documents'])} documents")

    def is_collection_empty(self) -> bool:
        """Return True when the collection holds no documents (or on error)."""
        try:
            content = self.collection.get()
            return len(content["documents"]) == 0
        except Exception as e:
            print(f"Error checking collection: {e}")
            return True

    def add_documents(self, chunks: List[Dict[str, Any]]):
        """Embed *chunks* and add them to the collection and the JSON ledger.

        Each chunk is a dict with "content" and "metadata" keys; metadata must
        include "source" and "topics". Returns True on success, False on
        failure or empty input.
        """
        try:
            # Guard: the original raised NameError/ValueError on an empty list.
            if not chunks:
                return False

            texts = [chunk["content"] for chunk in chunks]
            metadatas = [chunk["metadata"] for chunk in chunks]

            print(f"Generating embeddings for {len(texts)} documents...")
            print(texts)
            embeddings = self.embedder.get_embeddings(texts)

            # One random batch ID per call; per-document IDs are this value
            # plus a running index.
            id_val = str(generate(size=8))
            print(f"Generated ID: {id_val}")

            # Record the batch in the ledger (all chunks share one source,
            # so only the first chunk's path is stored).
            record = {"id": id_val, "file_path": chunks[0]["metadata"]["source"]}
            if os.path.exists(self.json_file_path):
                with open(self.json_file_path, "r") as f:
                    data = json.load(f)
                data.append(record)
            else:
                data = [record]
            # Single write replaces the original's double-open of the file.
            with open(self.json_file_path, "w") as file:
                json.dump(data, file, indent=4)

            print("*************")
            ids = []
            # Clean metadata: Chroma only accepts scalar metadata values.
            for count, metadata in enumerate(metadatas):
                metadata["topics"] = str(metadata["topics"])
                ids.append(f"{id_val}{count}")
            print(metadatas)
            print("------------------------")
            print(len(metadatas))

            print(f"Adding {len(texts)} documents to collection...")
            self.collection.add(
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas,
                ids=ids,
            )

            # Verify addition
            collection_content = self.collection.get()
            print(
                f"Collection now contains {len(collection_content['documents'])} documents"
            )

            return True
        except Exception as e:
            print(f"Error adding documents: {e}")
            return False

    def query(self, query_text: str, n_results: int = 3) -> Dict:
        """Return the nearest chunks for *query_text*.

        n_results is capped at the collection size. On failure a dict with
        an "error" key is returned instead of raising.
        """
        try:
            print(f"Generating embedding for query: {query_text}")
            query_embedding = self.embedder.get_embeddings([query_text])

            print("Checking collection content:")
            collection_content = self.collection.get()
            print(
                f"Number of documents in collection: {len(collection_content['documents'])}"
            )

            print("Executing query...")
            query_vector = query_embedding.tolist()
            results = self.collection.query(
                n_results=min(n_results, len(collection_content["documents"])),
                query_embeddings=query_vector,
            )

            print(f"Query results: {json.dumps(results, indent=2)}")
            return results
        except Exception as e:
            print(f"Error during query: {e}")
            return {"error": str(e)}

    def delete_documents_by_filename(self, file_substring: str):
        """
        Delete documents from the collection and JSON file by matching a substring in the file path.

        Args:
            file_substring (str): Substring to match in the file paths.
        """
        try:
            print(file_substring)
            json_file = self.json_file_path
            if not os.path.exists(json_file):
                print(f"JSON file {json_file} does not exist.")
                return

            with open(json_file, "r") as f:
                data = json.load(f)

            # Find matching records
            matching_records = [
                record for record in data if file_substring in record["file_path"]
            ]
            if not matching_records:
                print(f"No records found matching substring: {file_substring}")
                return

            # Get IDs of matching records
            matching_ids = [record["id"] for record in matching_records]
            print("matching_ids", matching_ids)

            # Remove matching records from JSON file
            updated_data = [
                record for record in data if record["id"] not in matching_ids
            ]
            print("updated data", updated_data)

            with open(json_file, "w") as f:
                json.dump(updated_data, f, indent=4)

            print(f"Deleted {len(matching_records)} records from JSON file.")

            # Bug fix: the original only matched matching_ids[0], leaving the
            # other matching batches orphaned in the collection.
            all_ids = self.collection.get()["ids"]
            ids_to_delete = [
                doc_id
                for doc_id in all_ids
                if any(batch_id in doc_id for batch_id in matching_ids)
            ]

            # Delete those IDs from the collection
            if ids_to_delete:
                self.collection.delete(ids=ids_to_delete)
                print(
                    f"Deleted {len(ids_to_delete)} records from the collection."
                )
            else:
                print("No matching IDs found.")
        except Exception as e:
            print(f"Error deleting documents: {e}")
|