Spaces:

sumanthkv
/

pdf

Sleeping

App Files Files Community

sumanthkv committed on Apr 5, 2024

Commit

3a6de21

verified ·

1 Parent(s): 149797b

Upload app.py

Browse files

Files changed (1) hide show

app.py +97 -0

app.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import streamlit as st
+from langchain.document_loaders import PyPDFLoader
+from langchain.document_loaders import TextLoader
+from langchain.document_loaders import Docx2txtLoader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Chroma
+from huggingface_hub import notebook_login
+import torch
+import transformers
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import pipeline
+from langchain import HuggingFacePipeline
+from langchain.chains import ConversationalRetrievalChain
+from langchain.memory import ConversationBufferMemory
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.chat_models import ChatOpenAI
+import os
+import sys
+# Create a directory for documents if it doesn't exist
+if not os.path.exists("docs"):
+    os.makedirs("docs")
+# Define a function to load documents from the "docs" directory
+def load_documents():
+    document = []
+    for file in os.listdir("docs"):
+        if file.endswith(".pdf"):
+            pdf_path = "./docs/" + file
+            loader = PyPDFLoader(pdf_path)
+            document.extend(loader.load())
+        elif file.endswith('.docx') or file.endswith('.doc'):
+            doc_path = "./docs/" + file
+            loader = Docx2txtLoader(doc_path)
+            document.extend(loader.load())
+        elif file.endswith('.txt'):
+            text_path = "./docs/" + file
+            loader = TextLoader(text_path)
+            document.extend(loader.load())
+    return document
+# Load documents
+document = load_documents()
+# Split documents into chunks
+document_splitter = CharacterTextSplitter(separator='\n', chunk_size=500, chunk_overlap=100)
+document_chunks = document_splitter.split_documents(document)
+# Initialize embeddings
+embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
+# Set OpenAI API key
+os.environ["OPENAI_API_KEY"] = "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"
+# Initialize Chroma as the vector database
+vectordb = Chroma.from_documents(document_chunks, embedding=embeddings, persist_directory='./data')
+vectordb.persist()
+# Login to Hugging Face Hub
+notebook_login()
+# Initialize tokenizer and model for text generation
+tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", use_auth_token=True)
+model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", torch_dtype=torch.float16, device_map="auto")
+# Initialize the text generation pipeline
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.bfloat16, device_map='auto',
+                max_new_tokens=512, min_new_tokens=-1, top_k=30)
+# Initialize the conversational retrieval chain
+llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature': 0})
+llm = ChatOpenAI(temperature=0.7, model_name='gpt-3.5-turbo')
+memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
+pdf_qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectordb.as_retriever(search_kwargs={'k': 6}),
+                                                verbose=False, memory=memory)
+# Streamlit app
+st.title('DocBot - Your Document Query Assistant')
+st.write('Upload your documents to get started.')
+uploaded_files = st.file_uploader("Upload Files", type=['pdf', 'docx', 'doc', 'txt'], accept_multiple_files=True)
+if uploaded_files:
+    st.write("Uploaded Files:")
+    for file in uploaded_files:
+        with open(os.path.join("docs", file.name), "wb") as f:
+            f.write(file.getbuffer())
+    st.write("Files uploaded successfully. You can start asking questions now.")
+while True:
+    query = st.text_input("Ask a question:")
+    if query:
+        result = pdf_qa({"question": query})
+        st.write("Answer: " + result["answer"])