Spaces: Build error
Commit · 7b52c77
1 Parent(s): a0dc409
code corrected
Browse files
- app.py +93 -62
- requirements.txt +4 -1

app.py CHANGED
@@ -2,27 +2,62 @@ import os
 import gradio as gr
 import numpy as np
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.document_loaders import DirectoryLoader,
+from langchain.document_loaders import DirectoryLoader, PyPDFLoader
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
+from langchain.prompts import PromptTemplate
 from langchain_community.llms import HuggingFaceHub
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+import shutil
 
-#
+# Define directory variable
+load_dotenv(dotenv_path=os.path.join(os.getcwd(), '.env'))
+DOCUMENTS_DIR = "documents"
+# Set up environment variables for HuggingFace
 huggingface_token = os.getenv("HUGGINGFACE_API_TOKEN")
+os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
 if huggingface_token:
     os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token
-
-
+
+# # Remove the existing documents directory if it exists
+# if os.path.exists(DOCUMENTS_DIR):
+#     shutil.rmtree(DOCUMENTS_DIR)
+llm = ChatOpenAI(temperature=0.7, model_name="gpt-3.5-turbo")
 
 # Create a directory for document storage if it doesn't exist
-os.makedirs(
+os.makedirs(DOCUMENTS_DIR, exist_ok=True)
 
 # Function to load documents
-def load_documents(directory=
-
-    documents =
+def load_documents(directory=DOCUMENTS_DIR):
+    print("Entered load documents")
+    documents = []
+
+    # Find all PDF files
+    pdf_files = []
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if file.lower().endswith('.pdf'):
+                pdf_files.append(os.path.join(root, file))
+
+    print(f"Found {len(pdf_files)} PDF files")
+
+    # Process each PDF with error handling
+    for pdf_path in pdf_files:
+        try:
+            print(f"Processing {pdf_path}")
+            loader = PyPDFLoader(pdf_path)
+            file_documents = loader.load()
+            documents.extend(file_documents)
+            print(f"Successfully loaded {pdf_path}")
+        except Exception as e:
+            print(f"Failed to load {pdf_path}: {str(e)}")
+
+    print(f"Successfully loaded {len(documents)} documents")
     return documents
 
 # Function to process documents and create vector store
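For orientation, the rewritten `load_documents` yields one LangChain `Document` per PDF page, with the source path and page number in the metadata. A minimal smoke test of just this step, assuming `pypdf` is installed; the sample file name is hypothetical:

```python
# Standalone check of the loader step (run from the repo root).
# "documents/sample.pdf" is a placeholder -- use any real PDF path.
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("documents/sample.pdf")
pages = loader.load()        # one Document per page
print(len(pages))            # page count of the PDF
print(pages[0].metadata)     # e.g. {'source': 'documents/sample.pdf', 'page': 0}
```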
@@ -31,8 +66,8 @@ def process_documents():
 
     # Split documents into chunks
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=
-        chunk_overlap=
+        chunk_size=400,
+        chunk_overlap=150
     )
     chunks = text_splitter.split_documents(documents)
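The new values make the chunking concrete: 400-character chunks with a 150-character overlap, so text that straddles a chunk boundary appears in two consecutive chunks and stays retrievable. A standalone sketch on plain strings:

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=150)
text = "word " * 300                       # ~1500 characters of filler
chunks = splitter.split_text(text)
print(len(chunks))                         # a handful of overlapping chunks
print(max(len(c) for c in chunks) <= 400)  # True: every chunk respects the cap
```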
@@ -46,27 +81,36 @@ def process_documents():
 
 # Create RAG chain
 def create_chain(vector_store):
-    # Check if API token is available
     if not os.getenv("HUGGINGFACEHUB_API_TOKEN"):
         return None
-
-    #
-    )
+
+    # llm = HuggingFaceHub(
+    #     repo_id="google/flan-t5-large",
+    #     model_kwargs={"temperature": 0.5, "max_length": 512}
+    # )
 
-    # Create memory for the conversation
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         return_messages=True
     )
+
+    qa_prompt = PromptTemplate.from_template("""
+    You are a helpful assistant for answering questions about documents.
+
+    Context information is below.
+    ---------------------
+    {context}
+    ---------------------
+    Given the context information and not prior knowledge, answer the question: {question}
+    If the context is not provided, please respond saying, no context was found
+
+    """)
 
-    # Create the conversational chain
     chain = ConversationalRetrievalChain.from_llm(
         llm=llm,
         retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
-        memory=memory
+        memory=memory,
+        combine_docs_chain_kwargs={"prompt": qa_prompt}
    )
 
     return chain
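The chain now has three moving parts: a k=3 retriever over the FAISS store, a conversation buffer, and a custom prompt passed through `combine_docs_chain_kwargs`. A reduced sketch of the wiring (custom prompt omitted) that runs offline, with `FakeListLLM` and `FakeEmbeddings` standing in for the real models; these are test utilities, not what the app uses:

```python
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.llms import FakeListLLM
from langchain_community.vectorstores import FAISS

store = FAISS.from_texts(
    ["The report was published in 2021.", "It covers solar energy."],
    FakeEmbeddings(size=32),
)
chain = ConversationalRetrievalChain.from_llm(
    llm=FakeListLLM(responses=["2021"]),
    retriever=store.as_retriever(search_kwargs={"k": 1}),
    memory=ConversationBufferMemory(memory_key="chat_history", return_messages=True),
)
print(chain({"question": "When was the report published?"})["answer"])  # 2021
```

Substituting the app's `ChatOpenAI` LLM and real embeddings reproduces the production path.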
@@ -76,96 +120,83 @@ vector_store = None
 chain = None
 chat_history = []
 
-# Function to handle file uploads
+# Function to handle file uploads
+import shutil
+
 def upload_file(files):
+    print("Entered file processing:")
+    print(files)
     try:
-        # Clear existing documents if
-        for f in os.listdir(
-            file_path = os.path.join(
+        # Clear existing documents if uploading new ones
+        for f in os.listdir(DOCUMENTS_DIR):
+            file_path = os.path.join(DOCUMENTS_DIR, f)
             if os.path.isfile(file_path):
                 os.remove(file_path)
-
-        #
+
+        # Process uploaded files
         for file in files:
-            file_name = os.path.basename(file[0])
-                content = file[1]
-            else:
-                # If none of the above, try to handle as string with a default name
-                file_name = f"document_{len(os.listdir('documents'))}.txt"
-                content = str(file)
-
-            # Write content to file
-            file_path = os.path.join("documents", file_name)
-            with open(file_path, "w", encoding='utf-8') as f:
-                f.write(content if isinstance(content, str) else content.decode('utf-8'))
-
+            if isinstance(file, str) and os.path.isfile(file):
+                file_name = os.path.basename(file)
+                dest_path = os.path.join(DOCUMENTS_DIR, file_name)
+                shutil.copy(file, dest_path)
+                print(f"Copied {file} to {dest_path}")
+            else:
+                return f"Invalid file format or file not found: {file}"
+
+        # Process documents and create vector store
         global vector_store, chain
         vector_store = process_documents()
        chain = create_chain(vector_store)
+
         if chain is None:
             return "Files uploaded and processed, but HuggingFace API token is missing. Set the environment variable to enable the chatbot."
+
         return "Files uploaded and processed successfully!"
+
     except Exception as e:
         return f"Error processing files: {str(e)}"
 
 # Function to handle user queries
 def chat(message, history):
     global chain, chat_history, vector_store
 
-    # Check if documents exist
     if vector_store is None:
-        if os.path.exists(
+        if os.path.exists(DOCUMENTS_DIR) and any(os.path.isfile(os.path.join(DOCUMENTS_DIR, f)) for f in os.listdir(DOCUMENTS_DIR)):
             vector_store = process_documents()
             chain = create_chain(vector_store)
         else:
-            # Return in the format expected by Gradio chatbot
             return history + [[message, "Please upload documents first to initialize the chatbot."]]
 
-    # Check if API token is set
     if chain is None:
-        # Return in the format expected by Gradio chatbot
         return history + [[message, "HuggingFace API token is not set. Please set the HUGGINGFACE_API_TOKEN environment variable."]]
 
-    # Process the message with the chain
     try:
-        # Convert history to format expected by chain
         if history:
             chat_history = [(turn[0], turn[1]) for turn in history]
 
-        # Get response from chain
         response = chain({"question": message})
         answer = response['answer']
 
-        # Return in the format expected by Gradio chatbot
         return history + [[message, answer]]
     except Exception as e:
-        # Handle any errors
         error_message = f"Error processing your request: {str(e)}"
         return history + [[message, error_message]]
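The handler bridges two history shapes: Gradio's `Chatbot` uses lists of `[user, bot]` pairs, while the legacy chain expects `(user, bot)` tuples. The conversion in isolation:

```python
# Gradio history -> LangChain history, then append the new turn.
history = [["Hi", "Hello!"], ["What is RAG?", "Retrieval-augmented generation."]]
chat_history = [(turn[0], turn[1]) for turn in history]
print(chat_history[0])                              # ('Hi', 'Hello!')
print(history + [["New question", "New answer"]])   # shape of what chat() returns
```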
 
-# Create Gradio interface
+# Create Gradio interface
 with gr.Blocks(title="RAG Chatbot") as demo:
+
     gr.Markdown("# RAG-based Conversational Chatbot")
     gr.Markdown("Upload text documents and chat with an AI that can answer questions based on their content.")
 
     with gr.Row():
         with gr.Column(scale=1):
             file_output = gr.Textbox(label="Upload Status")
+
             file_input = gr.File(
                 file_count="multiple",
                 label="Upload Documents (.txt files)",
-                type="
+                type="filepath"
             )
             upload_button = gr.Button("Process Documents")
             upload_button.click(upload_file, inputs=[file_input], outputs=[file_output])
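`Button.click` wires the callback so the current value of `file_input` arrives as the function argument and the returned string fills the `file_output` textbox. The same pattern in a minimal, generic form; the component names here are illustrative, not this app's:

```python
import gradio as gr

def describe(paths):
    # With type="filepath", paths is a list of path strings (or None).
    return f"Received {len(paths or [])} file(s)"

with gr.Blocks() as demo:
    status = gr.Textbox(label="Status")
    picker = gr.File(file_count="multiple", type="filepath")
    button = gr.Button("Process")
    button.click(describe, inputs=[picker], outputs=[status])
```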
@@ -180,4 +211,4 @@ with gr.Blocks(title="RAG Chatbot") as demo:
 
 # Launch the app
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
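On Spaces, the bare `demo.launch()` is fine; the platform supplies the host and port. Outside Spaces you would usually pass them explicitly; the values below are conventional Gradio defaults, not settings from this repo:

```python
import gradio as gr

with gr.Blocks(title="RAG Chatbot") as demo:
    gr.Markdown("Placeholder UI for the launch example")

# Hypothetical explicit settings for a local or container run;
# 0.0.0.0 exposes the server beyond localhost.
demo.launch(server_name="0.0.0.0", server_port=7860)
```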
requirements.txt CHANGED

@@ -9,4 +9,7 @@ torch>=2.0.0
 protobuf>=3.20.0
 pydantic>=2.0.0
 accelerate>=0.21.0
-langchain-community
+langchain-community
+python-dotenv
+pypdf
+langchain-openai