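"""RAG chatbot demo: indexes an uploaded document into ChromaDB using
HuggingFace embeddings and answers questions with a Groq-hosted
Llama 3.1 model through a Gradio interface.

Expects the GROQ_API_KEY environment variable to be set.
"""
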
import os

from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import HumanMessage
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq
import gradio as gr

# ChromaDB persistence settings
DB_DIR = "chroma_db"
COLLECTION_NAME = "document_collection"

# Embedding model used for both indexing and retrieval
embedding_function = HuggingFaceEmbeddings()

# Groq LLM client; the API key is read from the environment
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
llm = ChatGroq(api_key=GROQ_API_KEY, model_name="llama-3.1-8b-instant")

# Tracks the most recently uploaded document so retrieval is scoped to it
current_document_id = None


def load_and_split_document(file_path):
    """Loads a document and splits it into chunks."""
    loader = UnstructuredFileLoader(file_path)
    documents = loader.load()

    text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)

    return chunks


def upload_and_process(file):
    """Processes an uploaded file and stores it in ChromaDB."""
    try:
        global current_document_id
        uploaded_file_path = file.name

        current_document_id = os.path.basename(uploaded_file_path)

        chunks = load_and_split_document(uploaded_file_path)

        # Tag each chunk with its source document so retrieval can be filtered to it
        for chunk in chunks:
            chunk.metadata['document_id'] = current_document_id

        vector_store = Chroma(
            persist_directory=DB_DIR,
            embedding_function=embedding_function,
            collection_name=COLLECTION_NAME
        )

        vector_store.add_documents(chunks)

        return f"Document successfully processed: {current_document_id}"
    except Exception as e:
        return f"Error processing document: {str(e)}"


def retrieve_and_generate_response(query):
    """Retrieves relevant text and uses the Groq LLM to generate a response."""
    try:
        vector_store = Chroma(
            persist_directory=DB_DIR,
            embedding_function=embedding_function,
            collection_name=COLLECTION_NAME
        )

        if current_document_id:
            # Restrict the search to chunks from the most recently uploaded document
            filter_dict = {"document_id": current_document_id}
            results = vector_store.similarity_search(
                query,
                k=2,
                filter=filter_dict
            )
        else:
            return "Please upload a document first."

        retrieved_texts = [doc.page_content for doc in results]
        context = "\n".join(retrieved_texts)

        if not context:
            return "No relevant content found in the current document."

        messages = [
            HumanMessage(content=f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}")
        ]

        response = llm.invoke(messages)
        return response.content
    except Exception as e:
        return f"Error generating response: {str(e)}"


with gr.Blocks() as demo:
    gr.Markdown("# 🤖 RAG Chatbot with Groq & ChromaDB")

    file_input = gr.File(label="Upload a PDF")
    upload_button = gr.Button("Process Document")
    upload_status = gr.Textbox(label="Upload Status", interactive=False)

    query_input = gr.Textbox(label="Ask a Question")
    response_output = gr.Textbox(label="Response", interactive=False)

    chat_button = gr.Button("Get Answer")

    upload_button.click(
        upload_and_process,
        inputs=[file_input],
        outputs=[upload_status]
    )
    chat_button.click(
        retrieve_and_generate_response,
        inputs=[query_input],
        outputs=[response_output]
    )

demo.launch()