# rag-chroma / app.py
# Author: pratikshahp — "Update app.py"
# Commit: cbef0cf (verified)
import streamlit as st
import fitz # PyMuPDF
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv
#hf_api_key = os.getenv("HF_TOKEN")
model_name = "openai-community/gpt2"
#model_name = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)#,use_auth_token=hf_api_key)
def get_llm_response(input_prompt, content, prompt):
combined_input = f"{input_prompt}\nContent: {content}\nQuestion: {prompt}\nAnswer:"
inputs = tokenizer(combined_input, return_tensors="pt")
outputs = model.generate(**inputs, max_length=400, num_return_sequences=1)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract the answer part from the response
answer_start = response.find("Answer:") + len("Answer:")
answer = response[answer_start:].strip()
return answer
# Function to extract text from PDF file
def extract_text_from_pdf(file):
try:
doc = fitz.open(stream=file.read(), filetype="pdf")
text = ""
for page in doc:
text += page.get_text()
return text
except Exception as e:
st.error(f"Error occurred while reading PDF file: {e}")
return ""
# Main function
def main():
# Set title and description
st.title("PDF Chatbot")
# Create a sidebar for file upload
st.sidebar.title("Upload PDF File")
uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type=['pdf'])
# Text input for prompt
prompt = st.text_input("Ask a Question", "")
# Submit button
submitted = st.button("Submit")
if submitted:
if uploaded_file is not None:
# Extract text from uploaded PDF file
pdf_text = extract_text_from_pdf(uploaded_file)
#st.write(pdf_text)
if pdf_text:
try:
# Create embeddings
embeddings = HuggingFaceEmbeddings()
# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=20,
length_function=len,
is_separator_regex=False,
)
chunks = text_splitter.create_documents([pdf_text])
# Store chunks in ChromaDB
persist_directory = 'pdf_embeddings'
vectordb = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=persist_directory)
vectordb.persist() # Persist ChromaDB
st.write("Embeddings stored successfully in ChromaDB.")
st.write(f"Persist directory: {persist_directory}")
# st.write(vectordb)
# Load persisted Chroma database
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
#st.write(vectordb)
# Perform question answering
if prompt:
docs = vectordb.similarity_search(prompt)
# st.write(docs[0])
text = docs[0].page_content
input_prompt = "You are an expert in understanding text contents. You will receive an input PDF file and you will have to answer questions based on the input file."
response = get_llm_response(input_prompt, text, prompt)
st.subheader("Generated Answer:")
st.write(response)
except Exception as e:
st.error(f"Error occurred during text processing: {e}")
if __name__ == "__main__":
main()