Spaces:
Build error
Build error
File size: 7,341 Bytes
1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 a0dc409 7b52c77 a0dc409 7b52c77 a0dc409 7b52c77 a0dc409 7b52c77 a0dc409 7b52c77 a0dc409 7b52c77 a0dc409 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 7b52c77 1b7e795 a0dc409 7b52c77 1b7e795 7b52c77 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
import os
import gradio as gr
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceHub
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
import shutil
# --- Environment & model setup -------------------------------------------
# Load API keys from a .env file in the current working directory.
load_dotenv(dotenv_path=os.path.join(os.getcwd(), '.env'))

# Directory where uploaded PDFs are stored before indexing.
DOCUMENTS_DIR = "documents"

# Export tokens for components that read them from the environment.
# BUG FIX: the original did os.environ["OPENAI_API_KEY"] = os.getenv(...),
# which raises TypeError when the variable is unset (os.environ values must
# be strings, never None). Both keys are now set only when present.
huggingface_token = os.getenv("HUGGINGFACE_API_TOKEN")
if huggingface_token:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token

openai_api_key = os.getenv('OPENAI_API_KEY')
if openai_api_key:
    os.environ["OPENAI_API_KEY"] = openai_api_key

# LLM used for answering questions; requires OPENAI_API_KEY at call time.
llm = ChatOpenAI(temperature=0.7, model_name="gpt-3.5-turbo")

# Create a directory for document storage if it doesn't exist.
os.makedirs(DOCUMENTS_DIR, exist_ok=True)
# Function to load documents
def load_documents(directory=DOCUMENTS_DIR):
    """Recursively collect and parse every PDF found under *directory*.

    A file that fails to parse is reported and skipped, so one corrupt
    PDF does not abort the whole batch.

    Args:
        directory: root folder to scan (defaults to DOCUMENTS_DIR).

    Returns:
        list: langchain Document objects from all successfully-loaded PDFs.
    """
    print("Entered load documents")

    # Gather every .pdf path in the tree (extension match is case-insensitive).
    pdf_paths = [
        os.path.join(dirpath, name)
        for dirpath, _, names in os.walk(directory)
        for name in names
        if name.lower().endswith('.pdf')
    ]
    print(f"Found {len(pdf_paths)} PDF files")

    loaded = []
    for pdf_path in pdf_paths:
        try:
            print(f"Processing {pdf_path}")
            loaded.extend(PyPDFLoader(pdf_path).load())
            print(f"Successfully loaded {pdf_path}")
        except Exception as e:
            # Best-effort: report the failure and continue with the rest.
            print(f"Failed to load {pdf_path}: {str(e)}")

    print(f"Successfully loaded {len(loaded)} documents")
    return loaded
# Function to process documents and create vector store
def process_documents():
    """Load all stored PDFs, chunk them, embed them, and build a FAISS index.

    Returns:
        FAISS: an in-memory vector store over every document chunk.
    """
    docs = load_documents()

    # Split into overlapping chunks so retrieval returns focused passages
    # (400 characters per chunk, 150 characters of overlap).
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=150,
    )
    chunks = splitter.split_documents(docs)

    # Embed each chunk with a small local sentence-transformer model,
    # then index the vectors in FAISS.
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_documents(chunks, embedder)
# Create RAG chain
def create_chain(vector_store):
    """Build a conversational retrieval (RAG) chain over *vector_store*.

    Returns None when HUGGINGFACEHUB_API_TOKEN is not set.

    NOTE(review): the chain actually runs on the module-level OpenAI `llm`;
    gating on the HuggingFace token looks like a leftover from the
    commented-out HuggingFaceHub setup below — confirm which key should gate.
    Behavior is preserved here because chat() relies on the None return.
    """
    if not os.getenv("HUGGINGFACEHUB_API_TOKEN"):
        return None

    # llm = HuggingFaceHub(
    #     repo_id="google/flan-t5-large",
    #     model_kwargs={"temperature": 0.5, "max_length": 512}
    # )

    # Keep the running conversation so follow-up questions have context.
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )

    qa_prompt = PromptTemplate.from_template("""
You are a helpful assistant for answering questions about documents.
Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the question: {question}
If the context is not provided, please respond saying, no context was found
""")

    # Retrieve the 3 most similar chunks for each question.
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})

    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        combine_docs_chain_kwargs={"prompt": qa_prompt},
    )
# Initialize variables for handling chat state
# Module-level state shared by upload_file() and chat():
#   vector_store - FAISS index over the uploaded PDFs (None until built)
#   chain        - ConversationalRetrievalChain over vector_store (None until built)
#   chat_history - (question, answer) tuples mirrored from the Gradio history
vector_store = None
chain = None
chat_history = []
# Function to handle file uploads
# NOTE(review): duplicate of the top-of-file `import shutil` — harmless, removable.
import shutil
def upload_file(files):
    """Copy uploaded files into DOCUMENTS_DIR and rebuild the RAG index.

    Args:
        files: list of filesystem paths handed over by the Gradio File widget.

    Returns:
        str: a human-readable status message for the upload-status textbox.
    """
    global vector_store, chain

    print("Entered file processing:")
    print(files)
    try:
        # Start fresh: drop any previously uploaded documents.
        for entry in os.listdir(DOCUMENTS_DIR):
            candidate = os.path.join(DOCUMENTS_DIR, entry)
            if os.path.isfile(candidate):
                os.remove(candidate)

        # Copy each uploaded file into the storage directory.
        for file in files:
            if not (isinstance(file, str) and os.path.isfile(file)):
                return f"Invalid file format or file not found: {file}"
            dest_path = os.path.join(DOCUMENTS_DIR, os.path.basename(file))
            shutil.copy(file, dest_path)
            print(f"Copied {file} to {dest_path}")

        # Rebuild the vector store and QA chain from the new documents.
        vector_store = process_documents()
        chain = create_chain(vector_store)

        if chain is None:
            return "Files uploaded and processed, but HuggingFace API token is missing. Set the environment variable to enable the chatbot."
        return "Files uploaded and processed successfully!"
    except Exception as e:
        # Surface any failure as a status message rather than crashing the UI.
        return f"Error processing files: {str(e)}"
# Function to handle user queries
def chat(message, history):
    """Answer *message* with the RAG chain and return the updated history.

    Lazily rebuilds the vector store from DOCUMENTS_DIR when documents are
    present on disk but no index exists yet (e.g. after a process restart).

    Args:
        message: the user's question.
        history: Gradio chatbot history as [question, answer] pairs.

    Returns:
        list: *history* extended with the new [question, answer] pair.
    """
    global chain, chat_history, vector_store

    if vector_store is None:
        docs_on_disk = os.path.exists(DOCUMENTS_DIR) and any(
            os.path.isfile(os.path.join(DOCUMENTS_DIR, f))
            for f in os.listdir(DOCUMENTS_DIR)
        )
        if not docs_on_disk:
            return history + [[message, "Please upload documents first to initialize the chatbot."]]
        vector_store = process_documents()
        chain = create_chain(vector_store)

    if chain is None:
        return history + [[message, "HuggingFace API token is not set. Please set the HUGGINGFACE_API_TOKEN environment variable."]]

    try:
        if history:
            # Mirror the UI history as (question, answer) tuples.
            chat_history = [(turn[0], turn[1]) for turn in history]
        response = chain({"question": message})
        return history + [[message, response['answer']]]
    except Exception as e:
        error_message = f"Error processing your request: {str(e)}"
        return history + [[message, error_message]]
# --- Gradio interface -----------------------------------------------------
# BUG FIX: the ingestion pipeline only processes PDFs (load_documents filters
# on '.pdf'), but the UI advertised ".txt files" — labels now match behavior.
with gr.Blocks(title="RAG Chatbot") as demo:
    gr.Markdown("# RAG-based Conversational Chatbot")
    gr.Markdown("Upload PDF documents and chat with an AI that can answer questions based on their content.")

    with gr.Row():
        with gr.Column(scale=1):
            file_output = gr.Textbox(label="Upload Status")
            file_input = gr.File(
                file_count="multiple",
                label="Upload Documents (.pdf files)",
                type="filepath"
            )
            upload_button = gr.Button("Process Documents")
            # Copy the files into DOCUMENTS_DIR and (re)build the index.
            upload_button.click(upload_file, inputs=[file_input], outputs=[file_output])

        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=400)
            msg = gr.Textbox(label="Ask a question about your documents")
            msg.submit(chat, inputs=[msg, chatbot], outputs=[chatbot])
            clear = gr.Button("Clear")
            # Reset only the visible transcript (chain memory is untouched).
            clear.click(lambda: [], outputs=[chatbot])

# Launch the app when executed as a script.
if __name__ == "__main__":
    demo.launch()
|