Spaces:
Sleeping
Sleeping
Commit ·
d45043b
1
Parent(s): 72bf066
feat: use firebase for documents, add readme
Browse files- README.md +81 -0
- agent.py +0 -16
- apis/v1/configs/firebase_config.py +30 -0
- apis/v1/configs/{llm_configs.py → llm_config.py} +0 -0
- apis/v1/controllers/document_controller.py +41 -37
- apis/v1/controllers/rag_controller.py +1 -1
- apis/v1/providers/__init__.py +3 -1
- apis/v1/providers/firebase_provider.py +87 -0
- apis/v1/routes/documents.py +25 -24
- app.py +2 -0
- test.py +0 -54
README.md
CHANGED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# RAG
|
| 2 |
+
|
| 3 |
+
This project provides a web interface to upload PDF documents and ask questions about their content. It uses a FastAPI backend to process the documents and provide summarized responses, and a Streamlit frontend to create an interactive user interface.
|
| 4 |
+
|
| 5 |
+
## Table of Contents
|
| 6 |
+
|
| 7 |
+
- [Installation](#installation)
|
| 8 |
+
- [Usage](#usage)
|
| 9 |
+
- [Contributing](#contributing)
|
| 10 |
+
- [License](#license)
|
| 11 |
+
|
| 12 |
+
## Installation
|
| 13 |
+
|
| 14 |
+
### Prerequisites
|
| 15 |
+
|
| 16 |
+
- Python 3.9+
|
| 17 |
+
- `pip` (Python package installer)
|
| 18 |
+
|
| 19 |
+
### Steps
|
| 20 |
+
|
| 21 |
+
1. Clone the repository:
|
| 22 |
+
|
| 23 |
+
```bash
|
| 24 |
+
git clone https://github.com/dtduy77/RAG.git
|
| 25 |
+
cd RAG
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
2. Create and activate a virtual environment:
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
python -m venv env
|
| 32 |
+
source env/bin/activate  # On Windows use `env\Scripts\activate`
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
3. Install the required dependencies:
|
| 36 |
+
|
| 37 |
+
```bash
|
| 38 |
+
pip install -r requirements.txt
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
4. Run the FastAPI server:
|
| 42 |
+
|
| 43 |
+
```bash
|
| 44 |
+
uvicorn main:app --reload
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
5. Run the Streamlit app in another terminal:
|
| 48 |
+
|
| 49 |
+
```bash
|
| 50 |
+
streamlit run app.py
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
## Usage
|
| 54 |
+
|
| 55 |
+
1. Open your web browser and go to http://localhost:8501.
|
| 56 |
+
|
| 57 |
+
2. Use the file uploader to select a PDF document.
|
| 58 |
+
|
| 59 |
+
3. Enter a question related to the content of the document.
|
| 60 |
+
|
| 61 |
+
4. Click on "Get Summary" to receive a summarized response.
|
| 62 |
+
|
| 63 |
+
## Contributing
|
| 64 |
+
|
| 65 |
+
1. Fork the repository.
|
| 66 |
+
|
| 67 |
+
2. Create a new branch (`git checkout -b feature/your-feature`).
|
| 68 |
+
|
| 69 |
+
3. Make your changes.
|
| 70 |
+
|
| 71 |
+
4. Commit your changes (`git commit -m 'Add your message'`).
|
| 72 |
+
|
| 73 |
+
5. Push to the branch (`git push origin feature/your-feature`).
|
| 74 |
+
|
| 75 |
+
6. Open a Pull Request.
|
| 76 |
+
|
| 77 |
+
## License
|
| 78 |
+
|
| 79 |
+
This project is open-source.
|
| 80 |
+
|
| 81 |
+
Feel free to contribute and enhance the functionality!
|
agent.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
from langchain_openai import ChatOpenAI
|
| 2 |
-
from langchain import hub
|
| 3 |
-
from langchain.agents import create_tool_calling_agent
|
| 4 |
-
from langchain.agents import AgentExecutor
|
| 5 |
-
|
| 6 |
-
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
|
| 7 |
-
|
| 8 |
-
# Get the prompt to use - you can modify this!
|
| 9 |
-
prompt = hub.pull("hwchase17/openai-functions-agent")
|
| 10 |
-
# prompt.messages
|
| 11 |
-
|
| 12 |
-
agent = create_tool_calling_agent(llm, tools, prompt)
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
apis/v1/configs/firebase_config.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import firebase_admin
|
| 2 |
+
from firebase_admin import credentials, firestore
|
| 3 |
+
import os
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
# Build the service-account credentials from environment variables
# (loaded from .env by load_dotenv() above).
_private_key = os.getenv("PRIVATE_KEY")
if _private_key is None:
    # Fail with an explicit message instead of the AttributeError that
    # calling .replace() on None would produce.
    raise RuntimeError(
        "PRIVATE_KEY environment variable is not set; "
        "cannot build Firebase credentials"
    )

credential_firebase = {
    "type": os.getenv("TYPE"),
    "project_id": os.getenv("PROJECT_ID"),
    "private_key_id": os.getenv("PRIVATE_KEY_ID"),
    # The key is stored with literal "\n" sequences; turn them back into
    # real newlines.
    "private_key": _private_key.replace('\\n', '\n'),
    "client_email": os.getenv("CLIENT_EMAIL"),
    "client_id": os.getenv("CLIENT_ID"),
    "auth_uri": os.getenv("AUTH_URI"),
    "token_uri": os.getenv("TOKEN_URI"),
    "auth_provider_x509_cert_url": os.getenv("AUTH_PROVIDER_X509_CERT_URL"),
    "client_x509_cert_url": os.getenv("CLIENT_X509_CERT_URL"),
    "universe_domain": os.getenv("UNIVERSE_DOMAIN"),
}

# Initialize the default Firebase app only once.
# NOTE(review): `_apps` is a private attribute of firebase_admin.
if not firebase_admin._apps:
    # Initialize the app with a service account, granting admin privileges
    cred = credentials.Certificate(credential_firebase)
    app = firebase_admin.initialize_app(cred)

# Firestore client shared by the providers that import `db` from this module.
db = firestore.client()
print("Database connected")
|
apis/v1/configs/{llm_configs.py → llm_config.py}
RENAMED
|
File without changes
|
apis/v1/controllers/document_controller.py
CHANGED
|
@@ -2,42 +2,46 @@ from typing import AnyStr
|
|
| 2 |
from fastapi import UploadFile, HTTPException, status, BackgroundTasks
|
| 3 |
import uuid
|
| 4 |
import time
|
|
|
|
| 5 |
from ..schemas.document_schema import DocSchema
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from fastapi import UploadFile, HTTPException, status, BackgroundTasks
|
| 3 |
import uuid
|
| 4 |
import time
|
| 5 |
+
from ..providers import firebase_db
|
| 6 |
from ..schemas.document_schema import DocSchema
|
| 7 |
|
| 8 |
+
def upload_document(data):
    """
    Upload a document through the Firebase provider.

    :param data: Document payload, passed unchanged to ``firebase_db.upload_doc``
    :return: The value returned by ``firebase_db.upload_doc``
    :raises HTTPException: 500 carrying the error text if the provider raises
    """
    try:
        return firebase_db.upload_doc(data)
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e),
        )
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_document(document_id: AnyStr):
    """
    Get a document by ID through the Firebase provider.

    :param document_id: ID of the document to fetch
    :return: The document data returned by ``firebase_db.get_doc``
    :raises HTTPException: 404 if the provider returns ``None``,
        500 carrying the error text if the provider raises
    """
    try:
        document = firebase_db.get_doc(document_id)
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e),
        ) from e

    # The 404 is raised outside the try block so the broad handler above
    # cannot re-wrap it as a 500.
    if document is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"No document found with ID {document_id}",
        )
    return document
|
| 28 |
+
|
| 29 |
+
def update_document(document_id: AnyStr, data):
    """
    Update a document by ID through the Firebase provider.

    :param document_id: ID of the document to update
    :param data: Fields to update, passed unchanged to ``firebase_db.update_doc``
    :return: The value returned by ``firebase_db.update_doc`` on success
    :raises HTTPException: 404 if the provider returns ``False``,
        500 carrying the error text if the provider raises
    """
    try:
        result = firebase_db.update_doc(document_id, data)
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e),
        ) from e

    # The provider signals "no such document" with False; report it as a 404
    # rather than a successful response. Raised outside the try block so the
    # broad handler above cannot re-wrap it as a 500.
    if result is False:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"No document found with ID {document_id}",
        )
    return result
|
| 38 |
+
|
| 39 |
+
def delete_document(document_id: AnyStr):
    """
    Delete a document by ID through the Firebase provider.

    :param document_id: ID of the document to delete
    :return: The value returned by ``firebase_db.delete_doc`` on success
    :raises HTTPException: 404 if the provider returns ``False``,
        500 carrying the error text if the provider raises
    """
    try:
        result = firebase_db.delete_doc(document_id)
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e),
        ) from e

    # The provider signals "no such document" with False; report it as a 404
    # rather than a successful response. Raised outside the try block so the
    # broad handler above cannot re-wrap it as a 500.
    if result is False:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"No document found with ID {document_id}",
        )
    return result
|
apis/v1/controllers/rag_controller.py
CHANGED
|
@@ -4,7 +4,7 @@ from langchain_core.runnables import RunnablePassthrough
|
|
| 4 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 5 |
from langchain_core.prompts import PromptTemplate
|
| 6 |
from langchain_pinecone import PineconeVectorStore
|
| 7 |
-
from ..configs.
|
| 8 |
from ..configs.word_embedding_config import mxbai_embedder
|
| 9 |
from ..controllers.vectorstore_controller import create_vector_store
|
| 10 |
from ..utils.prompts import rag_prompt
|
|
|
|
| 4 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 5 |
from langchain_core.prompts import PromptTemplate
|
| 6 |
from langchain_pinecone import PineconeVectorStore
|
| 7 |
+
from ..configs.llm_config import gemini_model as llm
|
| 8 |
from ..configs.word_embedding_config import mxbai_embedder
|
| 9 |
from ..controllers.vectorstore_controller import create_vector_store
|
| 10 |
from ..utils.prompts import rag_prompt
|
apis/v1/providers/__init__.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
| 1 |
from .vectorstore_provider import VectorStoreProvider
|
|
|
|
| 2 |
|
| 3 |
-
vectorstore_db = VectorStoreProvider()
|
|
|
|
|
|
| 1 |
from .vectorstore_provider import VectorStoreProvider
|
| 2 |
+
from .firebase_provider import FirebaseProvider
|
| 3 |
|
| 4 |
+
vectorstore_db = VectorStoreProvider()
|
| 5 |
+
firebase_db = FirebaseProvider()
|
apis/v1/providers/firebase_provider.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ..configs.firebase_config import db
|
| 2 |
+
|
| 3 |
+
class FirebaseProvider:
    """
    CRUD helper for a single Firestore collection.

    Every method works on the collection named by ``self.collection_name``
    through the client stored in ``self.db``. Exceptions raised by the
    client are not caught here; they propagate so the caller can turn them
    into an error response instead of receiving a success-looking value.
    """

    def __init__(self):
        self.collection_name = "Documents"
        self.db = db

    def _doc_ref(self, document_id):
        """Return the document reference for ``document_id`` in the collection."""
        return self.db.collection(self.collection_name).document(document_id)

    def upload_doc(self, data):
        """
        Uploads a document to Firestore.

        :param data: Dictionary containing the document data
        :return: Success message that includes the ID of the new document
        """
        # add() returns (update_time, document_reference); the reference
        # carries the generated ID the client needs to address the document.
        _, doc_ref = self.db.collection(self.collection_name).add(data)
        return (
            f"Document uploaded successfully to collection {self.collection_name} "
            f"with ID {doc_ref.id}."
        )

    def get_doc(self, document_id):
        """
        Retrieves a document from Firestore by document ID.

        :param document_id: ID of the Firestore document
        :return: Dictionary containing the document data or None if document is not found
        """
        snapshot = self._doc_ref(document_id).get()
        if not snapshot.exists:
            return None
        return snapshot.to_dict()

    def delete_doc(self, document_id):
        """
        Deletes a document from Firestore by document ID.

        :param document_id: ID of the Firestore document
        :return: Success message if the document was deleted, False if it does not exist
        """
        doc_ref = self._doc_ref(document_id)
        if not doc_ref.get().exists:
            return False
        doc_ref.delete()
        return f"Document with ID {document_id} deleted successfully from collection {self.collection_name}."

    def update_doc(self, document_id, data):
        """
        Updates a document in Firestore by document ID.

        :param document_id: ID of the Firestore document
        :param data: Dictionary containing the updated document data
        :return: Success message if the document was updated, False if it does not exist
        """
        doc_ref = self._doc_ref(document_id)
        if not doc_ref.get().exists:
            return False
        doc_ref.update(data)
        return f"Document with ID {document_id} updated successfully in collection {self.collection_name}."
|
| 86 |
+
|
| 87 |
+
|
apis/v1/routes/documents.py
CHANGED
|
@@ -2,40 +2,41 @@ from typing import Annotated
|
|
| 2 |
from io import BytesIO
|
| 3 |
from pydantic import BaseModel, Field
|
| 4 |
from fastapi import APIRouter, Depends, BackgroundTasks
|
|
|
|
| 5 |
from ..interfaces.document_interface import DocumentUploadResponseInterface
|
| 6 |
from ..utils.response_fmt import jsonResponseFmt
|
| 7 |
|
| 8 |
router = APIRouter(prefix="/documents", tags=["Documents"])
|
| 9 |
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
|
| 19 |
@router.post("/upload", response_model=DocumentUploadResponseInterface)
|
| 20 |
-
async def
|
| 21 |
"""
|
| 22 |
Upload a document
|
| 23 |
"""
|
| 24 |
-
|
| 25 |
-
return jsonResponseFmt(
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
| 2 |
from io import BytesIO
|
| 3 |
from pydantic import BaseModel, Field
|
| 4 |
from fastapi import APIRouter, Depends, BackgroundTasks
|
| 5 |
+
from ..controllers.document_controller import upload_document, get_document, update_document, delete_document
|
| 6 |
from ..interfaces.document_interface import DocumentUploadResponseInterface
|
| 7 |
from ..utils.response_fmt import jsonResponseFmt
|
| 8 |
|
| 9 |
router = APIRouter(prefix="/documents", tags=["Documents"])
|
| 10 |
|
| 11 |
|
| 12 |
+
@router.get("/{document_id}", response_model=DocumentUploadResponseInterface)
def get_doc(document_id: str):
    """
    Get a document by ID.

    Declared as a plain ``def`` because ``get_document`` is a blocking,
    synchronous call; FastAPI runs non-async handlers in its threadpool
    instead of on the event loop.

    :param document_id: ID of the document, taken from the URL path
    :return: ``jsonResponseFmt`` applied to the document and a success message
    """
    document = get_document(document_id)
    return jsonResponseFmt(document, "Document retrieved successfully")
|
| 19 |
|
| 20 |
@router.post("/upload", response_model=DocumentUploadResponseInterface)
def upload_doc(data: dict):
    """
    Upload a document.

    Declared as a plain ``def`` because ``upload_document`` is a blocking,
    synchronous call; FastAPI runs non-async handlers in its threadpool
    instead of on the event loop.

    :param data: Document payload from the request body
    :return: ``jsonResponseFmt`` applied to the upload result and a success message
    """
    document = upload_document(data)
    return jsonResponseFmt(document, "Document uploaded successfully")
|
| 27 |
|
| 28 |
+
@router.put("/{document_id}", response_model=DocumentUploadResponseInterface)
def update_doc(document_id: str, data: dict):
    """
    Update a document by ID.

    Declared as a plain ``def`` because ``update_document`` is a blocking,
    synchronous call; FastAPI runs non-async handlers in its threadpool
    instead of on the event loop.

    :param document_id: ID of the document, taken from the URL path
    :param data: Fields to update, from the request body
    :return: ``jsonResponseFmt`` applied to the update result and a success message
    """
    document = update_document(document_id, data)
    return jsonResponseFmt(document, "Document updated successfully")
|
| 35 |
|
| 36 |
+
@router.delete("/{document_id}", response_model=DocumentUploadResponseInterface)
def delete_doc(document_id: str):
    """
    Delete a document by ID.

    Declared as a plain ``def`` because ``delete_document`` is a blocking,
    synchronous call; FastAPI runs non-async handlers in its threadpool
    instead of on the event loop.

    :param document_id: ID of the document, taken from the URL path
    :return: ``jsonResponseFmt`` applied to the delete result and a success message
    """
    document = delete_document(document_id)
    return jsonResponseFmt(document, "Document deleted successfully")
|
app.py
CHANGED
|
@@ -30,3 +30,5 @@ if st.button("Get Summary"):
|
|
| 30 |
st.write(response.json())
|
| 31 |
else:
|
| 32 |
st.warning("Please upload a file and enter a question.")
|
|
|
|
|
|
|
|
|
| 30 |
st.write(response.json())
|
| 31 |
else:
|
| 32 |
st.warning("Please upload a file and enter a question.")
|
| 33 |
+
|
| 34 |
+
|
test.py
DELETED
|
@@ -1,54 +0,0 @@
|
|
| 1 |
-
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
import os
|
| 4 |
-
from langchain_community.document_loaders import PyPDFLoader
|
| 5 |
-
from langchain import hub
|
| 6 |
-
from langchain_chroma import Chroma
|
| 7 |
-
from langchain_community.document_loaders import WebBaseLoader
|
| 8 |
-
from langchain_core.output_parsers import StrOutputParser
|
| 9 |
-
from langchain_core.runnables import RunnablePassthrough
|
| 10 |
-
from langchain_openai import OpenAIEmbeddings
|
| 11 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 12 |
-
from langchain_huggingface import HuggingFaceEmbeddings
|
| 13 |
-
from langchain_core.prompts import PromptTemplate
|
| 14 |
-
from apis.v1.utils.prompts import rag_prompt
|
| 15 |
-
|
| 16 |
-
load_dotenv()
|
| 17 |
-
|
| 18 |
-
mxbai_embedder = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
llm = ChatGoogleGenerativeAI(google_api_key=os.environ.get("GOOGLE_API_KEY"),
|
| 22 |
-
model="gemini-1.5-pro-latest")
|
| 23 |
-
|
| 24 |
-
# Load and split the PDF document into pages
|
| 25 |
-
pdf_loader = PyPDFLoader("14014749.pdf")
|
| 26 |
-
# print(pdf_loader)
|
| 27 |
-
pages = pdf_loader.load_and_split()
|
| 28 |
-
# print(pages)
|
| 29 |
-
# Split the pages into smaller chunks
|
| 30 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 31 |
-
splits = text_splitter.split_documents(pages)
|
| 32 |
-
print(splits)
|
| 33 |
-
# Create a vector store from the document splits
|
| 34 |
-
vectorstore = Chroma.from_documents(documents=splits, embedding=mxbai_embedder)
|
| 35 |
-
|
| 36 |
-
# Retrieve and generate using the relevant snippets of the blog
|
| 37 |
-
retriever = vectorstore.as_retriever()
|
| 38 |
-
print(retriever)
|
| 39 |
-
custom_rag_prompt = PromptTemplate.from_template(rag_prompt)
|
| 40 |
-
print(custom_rag_prompt)
|
| 41 |
-
def format_docs(docs):
|
| 42 |
-
return "\n\n".join(doc.page_content for doc in docs)
|
| 43 |
-
|
| 44 |
-
# Define the RAG chain
|
| 45 |
-
rag_chain = (
|
| 46 |
-
{"context": retriever | format_docs, "question": RunnablePassthrough()}
|
| 47 |
-
| custom_rag_prompt
|
| 48 |
-
| llm
|
| 49 |
-
| StrOutputParser()
|
| 50 |
-
)
|
| 51 |
-
|
| 52 |
-
# Invoke the RAG chain with a question
|
| 53 |
-
response = rag_chain.invoke("Can you summarize the document?")
|
| 54 |
-
print(response)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|