duythduong committed on
Commit
d45043b
·
1 Parent(s): 72bf066

feat: use firebase for documents, add readme

Browse files
README.md CHANGED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG
2
+
3
+ This project provides a web interface to upload PDF documents and ask questions about their content. It uses a FastAPI backend to process the documents and provide summarized responses, and a Streamlit frontend to create an interactive user interface.
4
+
5
+ ## Table of Contents
6
+
7
+ - [Installation](#installation)
8
+ - [Usage](#usage)
9
+ - [Contributing](#contributing)
10
+ - [License](#license)
11
+
12
+ ## Installation
13
+
14
+ ### Prerequisites
15
+
16
+ - Python 3.7+
17
+ - `pip` (Python package installer)
18
+
19
+ ### Steps
20
+
21
+ 1. Clone the repository:
22
+
23
+ ```bash
24
+ git clone https://github.com/dtduy77/RAG.git
25
+ cd RAG
26
+ ```
27
+
28
+ 2. Create and activate a virtual environment:
29
+
30
+ ```bash
31
+ python -m venv env
32
+ source env/bin/activate # On Windows use `env\Scripts\activate`
33
+ ```
34
+
35
+ 3. Install the required dependencies:
36
+
37
+ ```bash
38
+ pip install -r requirements.txt
39
+ ```
40
+
41
+ 4. Run the FastAPI server:
42
+
43
+ ```bash
44
+ uvicorn main:app --reload
45
+ ```
46
+
47
+ 5. Run the Streamlit app in another terminal:
48
+
49
+ ```bash
50
+ streamlit run app.py
51
+ ```
52
+
53
+ ## Usage
54
+
55
+ 1. Open your web browser and go to http://localhost:8501.
56
+
57
+ 2. Use the file uploader to select a PDF document.
58
+
59
+ 3. Enter a question related to the content of the document.
60
+
61
+ 4. Click on "Get Summary" to receive a summarized response.
62
+
63
+ ## Contributing
64
+
65
+ 1. Fork the repository.
66
+
67
+ 2. Create a new branch (`git checkout -b feature/your-feature`).
68
+
69
+ 3. Make your changes.
70
+
71
+ 4. Commit your changes (`git commit -m 'Add your message'`).
72
+
73
+ 5. Push to the branch (`git push origin feature/your-feature`).
74
+
75
+ 6. Open a Pull Request.
76
+
77
+ ## License
78
+
79
+ This project is open-source.
80
+
81
+ Feel free to contribute and enhance the functionality!
agent.py DELETED
@@ -1,16 +0,0 @@
1
- from langchain_openai import ChatOpenAI
2
- from langchain import hub
3
- from langchain.agents import create_tool_calling_agent
4
- from langchain.agents import AgentExecutor
5
-
6
- llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
7
-
8
- # Get the prompt to use - you can modify this!
9
- prompt = hub.pull("hwchase17/openai-functions-agent")
10
- # prompt.messages
11
-
12
- agent = create_tool_calling_agent(llm, tools, prompt)
13
-
14
-
15
-
16
- agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
apis/v1/configs/firebase_config.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import firebase_admin
2
+ from firebase_admin import credentials, firestore
3
+ import os
4
+ from dotenv import load_dotenv
5
+ load_dotenv()
6
+
7
+ # get credentials from .env
8
+ credential_firebase = {
9
+ "type": os.getenv("TYPE"),
10
+ "project_id": os.getenv("PROJECT_ID"),
11
+ "private_key_id": os.getenv("PRIVATE_KEY_ID"),
12
+ "private_key": os.getenv("PRIVATE_KEY").replace('\\n', '\n'),
13
+ "client_email": os.getenv("CLIENT_EMAIL"),
14
+ "client_id": os.getenv("CLIENT_ID"),
15
+ "auth_uri": os.getenv("AUTH_URI"),
16
+ "token_uri": os.getenv("TOKEN_URI"),
17
+ "auth_provider_x509_cert_url": os.getenv("AUTH_PROVIDER_X509_CERT_URL"),
18
+ "client_x509_cert_url": os.getenv("CLIENT_X509_CERT_URL"),
19
+ "universe_domain": os.getenv("UNIVERSE_DOMAIN")
20
+ }
21
+
22
+ # check if firebase is not initialized
23
+ if not firebase_admin._apps:
24
+ # Initialize the app with a service account, granting admin privileges
25
+ cred = credentials.Certificate(credential_firebase)
26
+ app = firebase_admin.initialize_app(cred)
27
+
28
+ # Initialize Firestore
29
+ db = firestore.client()
30
+ print("Database connected")
apis/v1/configs/{llm_configs.py → llm_config.py} RENAMED
File without changes
apis/v1/controllers/document_controller.py CHANGED
@@ -2,42 +2,46 @@ from typing import AnyStr
2
  from fastapi import UploadFile, HTTPException, status, BackgroundTasks
3
  import uuid
4
  import time
 
5
  from ..schemas.document_schema import DocSchema
6
 
7
- # def get_all_docs():
8
- # '''
9
- # Get all the documents from the database.
10
- # '''
11
- # return Document.objects.all()
12
-
13
- # def process_doc(file_path: AnyStr):
14
- # '''
15
- # Process a document.
16
- # '''
17
- # return splits
18
-
19
- # def _upload_docs(filename: AnyStr, doc: DocSchema):
20
- # '''
21
- # Get content type of file.
22
- # '''
23
- # # Get content type of file
24
- # content_type = get_content_type(filename)
25
- # path, url = storage_db.upload(data, filename, content_type)
26
- # cv.update_path_url(path, url)
27
-
28
- # return
29
-
30
- # def update_docs():
31
- # '''
32
- # Update a document in the database.
33
- # '''
34
- # return Document.objects.update()
35
-
36
- # def delete_docs():
37
- # '''
38
- # Delete a document from the database.
39
- # '''
40
- # return Document.objects.delete()
41
-
42
-
43
- # async def upload_doc()
 
 
 
 
2
  from fastapi import UploadFile, HTTPException, status, BackgroundTasks
3
  import uuid
4
  import time
5
+ from ..providers import firebase_db
6
  from ..schemas.document_schema import DocSchema
7
 
8
+ def upload_document(data):
9
+ """
10
+ Upload a document
11
+ """
12
+ try:
13
+ upload_document = firebase_db.upload_doc(data)
14
+ return upload_document
15
+ except Exception as e:
16
+ raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
17
+
18
+
19
+ def get_document(document_id: AnyStr):
20
+ """
21
+ Get a document
22
+ """
23
+ try:
24
+ document = firebase_db.get_doc(document_id)
25
+ return document
26
+ except Exception as e:
27
+ raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
28
+
29
+ def update_document(document_id: AnyStr, data):
30
+ """
31
+ Update a document
32
+ """
33
+ try:
34
+ update_document = firebase_db.update_doc(document_id, data)
35
+ return update_document
36
+ except Exception as e:
37
+ raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
38
+
39
+ def delete_document(document_id: AnyStr):
40
+ """
41
+ Delete a document
42
+ """
43
+ try:
44
+ delete_document = firebase_db.delete_doc(document_id)
45
+ return delete_document
46
+ except Exception as e:
47
+ raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
apis/v1/controllers/rag_controller.py CHANGED
@@ -4,7 +4,7 @@ from langchain_core.runnables import RunnablePassthrough
4
  from langchain_text_splitters import RecursiveCharacterTextSplitter
5
  from langchain_core.prompts import PromptTemplate
6
  from langchain_pinecone import PineconeVectorStore
7
- from ..configs.llm_configs import gemini_model as llm
8
  from ..configs.word_embedding_config import mxbai_embedder
9
  from ..controllers.vectorstore_controller import create_vector_store
10
  from ..utils.prompts import rag_prompt
 
4
  from langchain_text_splitters import RecursiveCharacterTextSplitter
5
  from langchain_core.prompts import PromptTemplate
6
  from langchain_pinecone import PineconeVectorStore
7
+ from ..configs.llm_config import gemini_model as llm
8
  from ..configs.word_embedding_config import mxbai_embedder
9
  from ..controllers.vectorstore_controller import create_vector_store
10
  from ..utils.prompts import rag_prompt
apis/v1/providers/__init__.py CHANGED
@@ -1,3 +1,5 @@
1
  from .vectorstore_provider import VectorStoreProvider
 
2
 
3
- vectorstore_db = VectorStoreProvider()
 
 
1
  from .vectorstore_provider import VectorStoreProvider
2
+ from .firebase_provider import FirebaseProvider
3
 
4
+ vectorstore_db = VectorStoreProvider()
5
+ firebase_db = FirebaseProvider()
apis/v1/providers/firebase_provider.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..configs.firebase_config import db
2
+
3
+ class FirebaseProvider:
4
+ def __init__(self):
5
+ self.collection_name = "Documents"
6
+ self.db = db
7
+
8
+ def upload_doc(self, data):
9
+ """
10
+ Uploads a document to Firestore.
11
+
12
+ :param collection_name: Name of the Firestore collection
13
+ :param data: Dictionary containing the document data
14
+ :return: document is successfully uploaded, error otherwise
15
+ """
16
+ try:
17
+ self.db.collection(self.collection_name).add(data)
18
+ return f"Document uploaded successfully to collection {self.collection_name}."
19
+ except Exception as e:
20
+ return (f"An error occurred: {e}")
21
+
22
+ def get_doc(self, document_id):
23
+ """
24
+ Retrieves a document from Firestore by collection name and document ID.
25
+
26
+ :param collection_name: Name of the Firestore collection
27
+ :param document_id: ID of the Firestore document
28
+ :return: Dictionary containing the document data or None if document is not found
29
+ """
30
+ try:
31
+ doc_ref = self.db.collection(self.collection_name).document(document_id)
32
+ doc = doc_ref.get()
33
+ if doc.exists:
34
+ print(f"Document with ID {document_id} retrieved successfully from collection {self.collection_name}.")
35
+ return doc.to_dict()
36
+ else:
37
+ print(f"No document found with ID {document_id} in collection {self.collection_name}.")
38
+ return None
39
+ except Exception as e:
40
+ print(f"An error occurred: {e}")
41
+ return None
42
+
43
+ def delete_doc(self, document_id):
44
+ """
45
+ Deletes a document from Firestore by collection name and document ID.
46
+
47
+ :param collection_name: Name of the Firestore collection
48
+ :param document_id: ID of the Firestore document
49
+ :return: document is successfully deleted, False otherwise
50
+ """
51
+ try:
52
+ doc_ref = self.db.collection(self.collection_name).document(document_id)
53
+ doc = doc_ref.get()
54
+ if doc.exists:
55
+ doc_ref.delete()
56
+ return f"Document with ID {document_id} deleted successfully from collection {self.collection_name}."
57
+ else:
58
+ print(f"No document found with ID {document_id} in collection {self.collection_name}.")
59
+ return False
60
+ except Exception as e:
61
+ print(f"An error occurred: {e}")
62
+ return False
63
+
64
+ def update_doc(self, document_id, data):
65
+ """
66
+ Updates a document in Firestore by collection name and document ID.
67
+
68
+ :param collection_name: Name of the Firestore collection
69
+ :param document_id: ID of the Firestore document
70
+ :param data: Dictionary containing the updated document data
71
+ :return: document is successfully updated, error otherwise
72
+ """
73
+
74
+ try:
75
+ doc_ref = self.db.collection(self.collection_name).document(document_id)
76
+ doc = doc_ref.get()
77
+ if doc.exists:
78
+ doc_ref.update(data)
79
+ return f"Document with ID {document_id} updated successfully in collection {self.collection_name}."
80
+ else:
81
+ print(f"No document found with ID {document_id} in collection {self.collection_name}.")
82
+ return False
83
+ except Exception as e:
84
+ print(f"An error occurred: {e}")
85
+ return False
86
+
87
+
apis/v1/routes/documents.py CHANGED
@@ -2,40 +2,41 @@ from typing import Annotated
2
  from io import BytesIO
3
  from pydantic import BaseModel, Field
4
  from fastapi import APIRouter, Depends, BackgroundTasks
 
5
  from ..interfaces.document_interface import DocumentUploadResponseInterface
6
  from ..utils.response_fmt import jsonResponseFmt
7
 
8
  router = APIRouter(prefix="/documents", tags=["Documents"])
9
 
10
 
11
- # @router.get("/{document_id}", response_model=DocumentResponse)
12
- # async def get_document(document_id: str):
13
- # """
14
- # Get a document
15
- # """
16
-
17
- # return {"document_id": document_id}
18
 
19
  @router.post("/upload", response_model=DocumentUploadResponseInterface)
20
- async def upload_document():
21
  """
22
  Upload a document
23
  """
24
-
25
- return jsonResponseFmt(None,"Document uploaded successfully")
26
 
27
- # @router.update("/{document_id}", response_model=DocumentResponse)
28
- # async def update_document(document_id: str):
29
- # """
30
- # Update a document
31
- # """
32
-
33
- # return {"document_id": document_id}
34
 
35
- # @router.delete("/{document_id}")
36
- # async def delete_document(document_id: str):
37
- # """
38
- # Delete a document
39
- # """
40
-
41
- # return {"document_id": document_id}
 
2
  from io import BytesIO
3
  from pydantic import BaseModel, Field
4
  from fastapi import APIRouter, Depends, BackgroundTasks
5
+ from ..controllers.document_controller import upload_document, get_document, update_document, delete_document
6
  from ..interfaces.document_interface import DocumentUploadResponseInterface
7
  from ..utils.response_fmt import jsonResponseFmt
8
 
9
  router = APIRouter(prefix="/documents", tags=["Documents"])
10
 
11
 
12
+ @router.get("/{document_id}", response_model=DocumentUploadResponseInterface)
13
+ async def get_doc(document_id: str):
14
+ """
15
+ Get a document
16
+ """
17
+ document = get_document(document_id)
18
+ return jsonResponseFmt(document,"Document retrieved successfully")
19
 
20
  @router.post("/upload", response_model=DocumentUploadResponseInterface)
21
+ async def upload_doc(data: dict):
22
  """
23
  Upload a document
24
  """
25
+ document = upload_document(data)
26
+ return jsonResponseFmt(document,"Document uploaded successfully")
27
 
28
+ @router.put("/{document_id}", response_model=DocumentUploadResponseInterface)
29
+ async def update_doc(document_id: str, data: dict):
30
+ """
31
+ Update a document
32
+ """
33
+ document = update_document(document_id, data)
34
+ return jsonResponseFmt(document,"Document updated successfully")
35
 
36
+ @router.delete("/{document_id}", response_model=DocumentUploadResponseInterface)
37
+ async def delete_doc(document_id: str):
38
+ """
39
+ Delete a document
40
+ """
41
+ document = delete_document(document_id)
42
+ return jsonResponseFmt(document,"Document deleted successfully")
app.py CHANGED
@@ -30,3 +30,5 @@ if st.button("Get Summary"):
30
  st.write(response.json())
31
  else:
32
  st.warning("Please upload a file and enter a question.")
 
 
 
30
  st.write(response.json())
31
  else:
32
  st.warning("Please upload a file and enter a question.")
33
+
34
+
test.py DELETED
@@ -1,54 +0,0 @@
1
- from langchain_google_genai import ChatGoogleGenerativeAI
2
- from dotenv import load_dotenv
3
- import os
4
- from langchain_community.document_loaders import PyPDFLoader
5
- from langchain import hub
6
- from langchain_chroma import Chroma
7
- from langchain_community.document_loaders import WebBaseLoader
8
- from langchain_core.output_parsers import StrOutputParser
9
- from langchain_core.runnables import RunnablePassthrough
10
- from langchain_openai import OpenAIEmbeddings
11
- from langchain_text_splitters import RecursiveCharacterTextSplitter
12
- from langchain_huggingface import HuggingFaceEmbeddings
13
- from langchain_core.prompts import PromptTemplate
14
- from apis.v1.utils.prompts import rag_prompt
15
-
16
- load_dotenv()
17
-
18
- mxbai_embedder = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
19
-
20
-
21
- llm = ChatGoogleGenerativeAI(google_api_key=os.environ.get("GOOGLE_API_KEY"),
22
- model="gemini-1.5-pro-latest")
23
-
24
- # Load and split the PDF document into pages
25
- pdf_loader = PyPDFLoader("14014749.pdf")
26
- # print(pdf_loader)
27
- pages = pdf_loader.load_and_split()
28
- # print(pages)
29
- # Split the pages into smaller chunks
30
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
31
- splits = text_splitter.split_documents(pages)
32
- print(splits)
33
- # Create a vector store from the document splits
34
- vectorstore = Chroma.from_documents(documents=splits, embedding=mxbai_embedder)
35
-
36
- # Retrieve and generate using the relevant snippets of the blog
37
- retriever = vectorstore.as_retriever()
38
- print(retriever)
39
- custom_rag_prompt = PromptTemplate.from_template(rag_prompt)
40
- print(custom_rag_prompt)
41
- def format_docs(docs):
42
- return "\n\n".join(doc.page_content for doc in docs)
43
-
44
- # Define the RAG chain
45
- rag_chain = (
46
- {"context": retriever | format_docs, "question": RunnablePassthrough()}
47
- | custom_rag_prompt
48
- | llm
49
- | StrOutputParser()
50
- )
51
-
52
- # Invoke the RAG chain with a question
53
- response = rag_chain.invoke("Can you summarize the document?")
54
- print(response)