avimittal30 committed
Commit 7b52c77 · 1 Parent(s): a0dc409

code corrected

Files changed (2):
  1. app.py +93 -62
  2. requirements.txt +4 -1
app.py CHANGED
@@ -2,27 +2,62 @@ import os
import gradio as gr
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.document_loaders import DirectoryLoader, TextLoader
+ from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
+ from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceHub
+ from dotenv import load_dotenv
+ from langchain_openai import ChatOpenAI
+ import shutil

- # Set up environment variables for HuggingFace - safely handle potential None value
+ # Define directory variable
+ load_dotenv(dotenv_path=os.path.join(os.getcwd(), '.env'))
+ DOCUMENTS_DIR = "documents"
+ # Set up environment variables for HuggingFace
huggingface_token = os.getenv("HUGGINGFACE_API_TOKEN")
+ os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
if huggingface_token:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token
- else:
-     print("Warning: HUGGINGFACE_API_TOKEN environment variable not set. You'll need to set it for the LLM to work.")
+
+ # # Remove the existing documents directory if it exists
+ # if os.path.exists(DOCUMENTS_DIR):
+ #     shutil.rmtree(DOCUMENTS_DIR)
+ llm = ChatOpenAI(temperature=0.7, model_name="gpt-3.5-turbo")

# Create a directory for document storage if it doesn't exist
- os.makedirs("documents", exist_ok=True)
+ os.makedirs(DOCUMENTS_DIR, exist_ok=True)

# Function to load documents
- def load_documents(directory="documents"):
-     loader = DirectoryLoader(directory, glob="**/*.txt", loader_cls=TextLoader)
-     documents = loader.load()
+ def load_documents(directory=DOCUMENTS_DIR):
+     print("Entered load documents")
+     documents = []
+
+     # Find all PDF files
+     pdf_files = []
+     for root, _, files in os.walk(directory):
+         for file in files:
+             if file.lower().endswith('.pdf'):
+                 pdf_files.append(os.path.join(root, file))
+
+     print(f"Found {len(pdf_files)} PDF files")
+
+     # Process each PDF with error handling
+     for pdf_path in pdf_files:
+         try:
+
+             print(f"Processing {pdf_path}")
+             loader = PyPDFLoader(pdf_path)
+
+             file_documents = loader.load()
+             documents.extend(file_documents)
+             print(f"Successfully loaded {pdf_path}")
+         except Exception as e:
+             print(f"Failed to load {pdf_path}: {str(e)}")
+
+     print(f"Successfully loaded {len(documents)} documents")
    return documents

# Function to process documents and create vector store
@@ -31,8 +66,8 @@ def process_documents():

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size=1000,
-         chunk_overlap=200
+         chunk_size=400,
+         chunk_overlap=150
    )
    chunks = text_splitter.split_documents(documents)

@@ -46,27 +81,36 @@ def process_documents():

# Create RAG chain
def create_chain(vector_store):
-     # Check if API token is available
    if not os.getenv("HUGGINGFACEHUB_API_TOKEN"):
        return None
-
-     # Initialize the LLM
-     llm = HuggingFaceHub(
-         repo_id="google/flan-t5-large",
-         model_kwargs={"temperature": 0.5, "max_length": 512}
-     )
+
+     # llm = HuggingFaceHub(
+     #     repo_id="google/flan-t5-large",
+     #     model_kwargs={"temperature": 0.5, "max_length": 512}
+     # )

-     # Create memory for the conversation
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True
    )
+
+     qa_prompt = PromptTemplate.from_template("""
+     You are a helpful assistant for answering questions about documents.
+
+     Context information is below.
+     ---------------------
+     {context}
+     ---------------------
+     Given the context information and not prior knowledge, answer the question: {question}
+     If the context is not provided, please respond saying, no context was found
+
+     """)

-     # Create the conversational chain
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
-         memory=memory
+         memory=memory,
+         combine_docs_chain_kwargs={"prompt": qa_prompt}
    )

    return chain
@@ -76,96 +120,83 @@ vector_store = None
chain = None
chat_history = []

- # Function to handle file uploads - FIXED to handle Gradio's file objects properly
+ # Function to handle file uploads
+ import shutil
+
def upload_file(files):
+     print("Entered file processing:")
+     print(files)
    try:
-         # Clear existing documents if we're uploading new ones
-         for f in os.listdir("documents"):
-             file_path = os.path.join("documents", f)
+         # Clear existing documents if uploading new ones
+         for f in os.listdir(DOCUMENTS_DIR):
+             file_path = os.path.join(DOCUMENTS_DIR, f)
            if os.path.isfile(file_path):
                os.remove(file_path)
-
-         # Handle the uploaded files
+
+         # Process uploaded files
        for file in files:
-             # Get file content and name
-             if hasattr(file, "name"): # For standard file objects
-                 file_name = os.path.basename(file.name)
-                 if hasattr(file, "read"):
-                     content = file.read()
-                 else: # For NamedString objects
-                     content = file.decode('utf-8') if isinstance(file, bytes) else str(file)
-             else: # For tuple format (file_name, file_data)
-                 if isinstance(file, tuple) and len(file) >= 2:
-                     file_name = os.path.basename(file[0])
-                     content = file[1]
-                 else:
-                     # If none of the above, try to handle as string with a default name
-                     file_name = f"document_{len(os.listdir('documents'))}.txt"
-                     content = str(file)
-
-             # Write content to file
-             file_path = os.path.join("documents", file_name)
-             with open(file_path, "w", encoding='utf-8') as f:
-                 f.write(content if isinstance(content, str) else content.decode('utf-8'))
-
+             if isinstance(file, str) and os.path.isfile(file):
+                 file_name = os.path.basename(file)
+                 dest_path = os.path.join(DOCUMENTS_DIR, file_name)
+                 shutil.copy(file, dest_path)
+                 print(f"Copied {file} to {dest_path}")
+             else:
+                 return f"Invalid file format or file not found: {file}"
+
+         # Process documents and create vector store
        global vector_store, chain
        vector_store = process_documents()
        chain = create_chain(vector_store)
-
+
        if chain is None:
            return "Files uploaded and processed, but HuggingFace API token is missing. Set the environment variable to enable the chatbot."
-
+
        return "Files uploaded and processed successfully!"
+
    except Exception as e:
        return f"Error processing files: {str(e)}"

+
# Function to handle user queries
def chat(message, history):
    global chain, chat_history, vector_store

-     # Check if documents exist
    if vector_store is None:
-         if os.path.exists("documents") and any(os.path.isfile(os.path.join("documents", f)) for f in os.listdir("documents")):
+         if os.path.exists(DOCUMENTS_DIR) and any(os.path.isfile(os.path.join(DOCUMENTS_DIR, f)) for f in os.listdir(DOCUMENTS_DIR)):
            vector_store = process_documents()
            chain = create_chain(vector_store)
        else:
-             # Return in the format expected by Gradio chatbot
            return history + [[message, "Please upload documents first to initialize the chatbot."]]

-     # Check if API token is set
    if chain is None:
-         # Return in the format expected by Gradio chatbot
        return history + [[message, "HuggingFace API token is not set. Please set the HUGGINGFACE_API_TOKEN environment variable."]]

-     # Process the message with the chain
    try:
-         # Convert history to format expected by chain
        if history:
            chat_history = [(turn[0], turn[1]) for turn in history]

-         # Get response from chain
        response = chain({"question": message})
        answer = response['answer']

-         # Return in the format expected by Gradio chatbot
        return history + [[message, answer]]
    except Exception as e:
-         # Handle any errors
        error_message = f"Error processing your request: {str(e)}"
        return history + [[message, error_message]]

- # Create Gradio interface - UPDATED to use a simpler file upload approach
+ # Create Gradio interface
with gr.Blocks(title="RAG Chatbot") as demo:
+
    gr.Markdown("# RAG-based Conversational Chatbot")
    gr.Markdown("Upload text documents and chat with an AI that can answer questions based on their content.")

    with gr.Row():
        with gr.Column(scale=1):
            file_output = gr.Textbox(label="Upload Status")
+
            file_input = gr.File(
                file_count="multiple",
                label="Upload Documents (.txt files)",
-                 type="text" # Specifying text type for proper handling
+                 type="filepath"
            )
            upload_button = gr.Button("Process Documents")
            upload_button.click(upload_file, inputs=[file_input], outputs=[file_output])
@@ -180,4 +211,4 @@ with gr.Blocks(title="RAG Chatbot") as demo:

# Launch the app
if __name__ == "__main__":
-     demo.launch()
+     demo.launch()
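
A minimal smoke-test sketch of the revised pipeline outside the Gradio UI. It assumes a `.env` file next to `app.py` that defines both `HUGGINGFACE_API_TOKEN` and `OPENAI_API_KEY` (the module reads them at import time and assigns `OPENAI_API_KEY` unconditionally), at least one PDF already placed in `documents/`, and that `process_documents()` and `create_chain()` behave as shown in the hunks above; the file name `smoke_test.py` is illustrative, not part of the commit.

```python
# smoke_test.py — illustrative only; exercises load_documents / process_documents /
# create_chain from app.py without launching the Gradio app.
from app import load_documents, process_documents, create_chain

docs = load_documents()             # walks documents/ and loads each *.pdf via PyPDFLoader
print(f"Loaded {len(docs)} pages")

vector_store = process_documents()  # splits into 400-char chunks (150 overlap) and indexes with FAISS
chain = create_chain(vector_store)  # ConversationalRetrievalChain with the custom qa_prompt

if chain is None:
    print("HUGGINGFACEHUB_API_TOKEN is not set; chain was not created")
else:
    result = chain({"question": "What are these documents about?"})
    print(result["answer"])
```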
requirements.txt CHANGED
@@ -9,4 +9,7 @@ torch>=2.0.0
protobuf>=3.20.0
pydantic>=2.0.0
accelerate>=0.21.0
- langchain-community
+ langchain-community
+ python-dotenv
+ pypdf
+ langchain-openai
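
The new requirements entries correspond to the imports introduced in app.py; a quick sanity-check sketch that the added packages resolve under the names the app uses (import paths taken from the diff, `pypdf` itself is only used indirectly by `PyPDFLoader`):

```python
# Confirm the packages added in this commit import cleanly.
from dotenv import load_dotenv                       # python-dotenv
from langchain.document_loaders import PyPDFLoader   # requires pypdf at runtime
from langchain_openai import ChatOpenAI              # langchain-openai
from langchain_community.vectorstores import FAISS   # langchain-community

print("New dependencies imported successfully")
```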