Shreyas094 commited on
Commit
82cd6c2
·
verified ·
1 Parent(s): 6f299e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -56
app.py CHANGED
@@ -5,17 +5,40 @@ from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
5
  from langchain.vectorstores import FAISS
6
  from huggingface_hub import InferenceClient
7
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  class RAGApplication:
10
  def __init__(self, hf_api_key):
11
- self.hf_api_key = hf_api_key
12
- self.vector_store = None
13
- self.embeddings = HuggingFaceInferenceAPIEmbeddings(
14
- api_key=hf_api_key,
15
- model_name="sentence-transformers/all-MiniLM-L6-v2"
16
- )
17
- self.client = InferenceClient(api_key=hf_api_key)
18
- self.conversation_history = []
 
 
 
 
 
 
 
 
 
19
  self.system_prompt = """You are a precise and accurate PDF summarization assistant. Your role is to:
20
  1. Provide accurate answers based solely on the provided context
21
  2. Maintain factual consistency and never hallucinate information
@@ -40,61 +63,88 @@ Answer:"""
40
 
41
  def process_pdf(self, file_path):
42
  try:
 
 
43
  if file_path is None:
 
44
  return "Please upload a PDF file."
45
 
46
  if not os.path.exists(file_path):
 
47
  return f"File not found: {file_path}"
48
 
49
  # Reset conversation history when new PDF is loaded
50
  self.conversation_history = []
 
51
 
52
  # Read PDF directly from the file path
 
53
  pdf_reader = PdfReader(file_path)
54
  text = ""
55
- for page in pdf_reader.pages:
56
- text += page.extract_text()
 
 
 
 
57
 
58
  if not text.strip():
 
59
  return "No text could be extracted from the PDF. Please make sure it's not empty or scanned."
60
 
61
  # Split text into chunks
 
62
  text_splitter = RecursiveCharacterTextSplitter(
63
  chunk_size=1000,
64
  chunk_overlap=200,
65
  length_function=len
66
  )
67
  chunks = text_splitter.split_text(text)
 
68
 
69
  if not chunks:
 
70
  return "No chunks were created. The PDF might be empty."
71
 
72
  # Create vector store
 
73
  self.vector_store = FAISS.from_texts(chunks, self.embeddings)
 
74
 
75
  return "PDF processed successfully! You can now ask questions about it."
76
  except Exception as e:
77
- return f"Error processing PDF: {str(e)}"
 
 
 
78
 
79
  def generate_response(self, message, history):
80
  try:
 
 
81
  if self.vector_store is None:
 
82
  return "Please upload and process a PDF first."
83
 
84
  query = message.strip()
85
  if not query:
 
86
  return "Please enter a question."
87
 
88
  # Search for relevant chunks
 
89
  relevant_chunks = self.vector_store.similarity_search(query, k=3)
90
  context = "\n\n".join([doc.page_content for doc in relevant_chunks])
 
91
 
92
  # Format conversation history
 
93
  conversation_history = "\n".join([
94
- f"Q: {q}\nA: {a}" for q, a in history[-3:] if q and a # Keep last 3 exchanges
95
  ])
96
 
97
  # Create prompt with system prompt, context, and conversation history
 
98
  prompt = self.system_prompt.format(
99
  context=context,
100
  conversation_history=conversation_history,
@@ -102,59 +152,83 @@ Answer:"""
102
  )
103
 
104
  # Generate response using Mistral
 
105
  response = ""
106
- for message in self.client.chat_completion(
107
- model="mistralai/Mistral-Nemo-Instruct-2407",
108
- messages=[
109
- {"role": "system", "content": prompt},
110
- {"role": "user", "content": query}
111
- ],
112
- max_tokens=500,
113
- stream=True,
114
- ):
115
- response += message.choices[0].delta.content
 
 
 
 
 
116
 
117
  return response
118
  except Exception as e:
119
- return f"Error generating response: {str(e)}"
 
 
 
120
 
121
  # Create Gradio interface
122
  def create_gradio_interface():
123
- # You should never hardcode API keys - use environment variables in production
124
- rag = RAGApplication(hf_api_key="your_huggingface_api_key")
125
-
126
- with gr.Blocks() as demo:
127
- gr.Markdown("# PDF Question Answering System")
128
 
129
- with gr.Row():
130
- pdf_input = gr.File(
131
- label="Upload PDF",
132
- file_types=[".pdf"],
133
- type="filepath"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  )
135
- process_button = gr.Button("Process PDF")
136
- status_output = gr.Textbox(label="Status", interactive=False)
137
 
138
- process_button.click(
139
- fn=rag.process_pdf,
140
- inputs=[pdf_input],
141
- outputs=[status_output]
142
- )
143
-
144
- chat_interface = gr.ChatInterface(
145
- fn=rag.generate_response,
146
- title="Chat with your PDF",
147
- description="Upload a PDF and ask questions about its contents.",
148
- theme="soft",
149
- examples=[
150
- "What is the main topic of this document?",
151
- "Can you summarize the key points?",
152
- "What are the main conclusions?",
153
- ],
154
- )
155
-
156
- return demo
157
 
158
  if __name__ == "__main__":
159
- demo = create_gradio_interface()
160
- demo.launch()
 
 
 
 
 
 
 
 
5
  from langchain.vectorstores import FAISS
6
  from huggingface_hub import InferenceClient
7
  import os
8
import logging
import traceback
from datetime import datetime

# Configure logging: every record goes both to a timestamped per-run log file
# and to the console (stderr).
_log_filename = f'rag_app_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(_log_filename),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
22
 
23
  class RAGApplication:
24
  def __init__(self, hf_api_key):
25
+ try:
26
+ self.hf_api_key = hf_api_key
27
+ self.vector_store = None
28
+ logger.info("Initializing HuggingFace embeddings...")
29
+ self.embeddings = HuggingFaceInferenceAPIEmbeddings(
30
+ api_key=hf_api_key,
31
+ model_name="sentence-transformers/all-MiniLM-L6-v2"
32
+ )
33
+ logger.info("Initializing HuggingFace client...")
34
+ self.client = InferenceClient(api_key=hf_api_key)
35
+ self.conversation_history = []
36
+ logger.info("RAGApplication initialized successfully")
37
+ except Exception as e:
38
+ logger.error(f"Error initializing RAGApplication: {str(e)}")
39
+ logger.error(f"Traceback: {traceback.format_exc()}")
40
+ raise
41
+
42
  self.system_prompt = """You are a precise and accurate PDF summarization assistant. Your role is to:
43
  1. Provide accurate answers based solely on the provided context
44
  2. Maintain factual consistency and never hallucinate information
 
63
 
64
  def process_pdf(self, file_path):
65
  try:
66
+ logger.info(f"Starting PDF processing for file: {file_path}")
67
+
68
  if file_path is None:
69
+ logger.warning("No file provided")
70
  return "Please upload a PDF file."
71
 
72
  if not os.path.exists(file_path):
73
+ logger.error(f"File not found at path: {file_path}")
74
  return f"File not found: {file_path}"
75
 
76
  # Reset conversation history when new PDF is loaded
77
  self.conversation_history = []
78
+ logger.info("Conversation history reset")
79
 
80
  # Read PDF directly from the file path
81
+ logger.info("Reading PDF file...")
82
  pdf_reader = PdfReader(file_path)
83
  text = ""
84
+ for i, page in enumerate(pdf_reader.pages):
85
+ try:
86
+ text += page.extract_text()
87
+ logger.debug(f"Extracted text from page {i+1}")
88
+ except Exception as e:
89
+ logger.error(f"Error extracting text from page {i+1}: {str(e)}")
90
 
91
  if not text.strip():
92
+ logger.warning("No text extracted from PDF")
93
  return "No text could be extracted from the PDF. Please make sure it's not empty or scanned."
94
 
95
  # Split text into chunks
96
+ logger.info("Splitting text into chunks...")
97
  text_splitter = RecursiveCharacterTextSplitter(
98
  chunk_size=1000,
99
  chunk_overlap=200,
100
  length_function=len
101
  )
102
  chunks = text_splitter.split_text(text)
103
+ logger.info(f"Created {len(chunks)} chunks")
104
 
105
  if not chunks:
106
+ logger.warning("No chunks created from text")
107
  return "No chunks were created. The PDF might be empty."
108
 
109
  # Create vector store
110
+ logger.info("Creating vector store...")
111
  self.vector_store = FAISS.from_texts(chunks, self.embeddings)
112
+ logger.info("Vector store created successfully")
113
 
114
  return "PDF processed successfully! You can now ask questions about it."
115
  except Exception as e:
116
+ error_msg = f"Error processing PDF: {str(e)}"
117
+ logger.error(error_msg)
118
+ logger.error(f"Traceback: {traceback.format_exc()}")
119
+ return error_msg
120
 
121
  def generate_response(self, message, history):
122
  try:
123
+ logger.info(f"Generating response for message: {message}")
124
+
125
  if self.vector_store is None:
126
+ logger.warning("No vector store available - PDF not processed")
127
  return "Please upload and process a PDF first."
128
 
129
  query = message.strip()
130
  if not query:
131
+ logger.warning("Empty query received")
132
  return "Please enter a question."
133
 
134
  # Search for relevant chunks
135
+ logger.info("Searching for relevant chunks...")
136
  relevant_chunks = self.vector_store.similarity_search(query, k=3)
137
  context = "\n\n".join([doc.page_content for doc in relevant_chunks])
138
+ logger.debug(f"Found {len(relevant_chunks)} relevant chunks")
139
 
140
  # Format conversation history
141
+ logger.debug(f"Processing conversation history (length: {len(history)})")
142
  conversation_history = "\n".join([
143
+ f"Q: {q}\nA: {a}" for q, a in history[-3:] if q and a
144
  ])
145
 
146
  # Create prompt with system prompt, context, and conversation history
147
+ logger.debug("Creating prompt...")
148
  prompt = self.system_prompt.format(
149
  context=context,
150
  conversation_history=conversation_history,
 
152
  )
153
 
154
  # Generate response using Mistral
155
+ logger.info("Generating response using Mistral...")
156
  response = ""
157
+ try:
158
+ for message in self.client.chat_completion(
159
+ model="mistralai/Mistral-Nemo-Instruct-2407",
160
+ messages=[
161
+ {"role": "system", "content": prompt},
162
+ {"role": "user", "content": query}
163
+ ],
164
+ max_tokens=500,
165
+ stream=True,
166
+ ):
167
+ response += message.choices[0].delta.content
168
+ logger.info("Response generated successfully")
169
+ except Exception as e:
170
+ logger.error(f"Error in chat completion: {str(e)}")
171
+ raise
172
 
173
  return response
174
  except Exception as e:
175
+ error_msg = f"Error generating response: {str(e)}"
176
+ logger.error(error_msg)
177
+ logger.error(f"Traceback: {traceback.format_exc()}")
178
+ return error_msg
179
 
180
  # Create Gradio interface
181
  def create_gradio_interface():
182
+ try:
183
+ logger.info("Creating Gradio interface...")
184
+ # You should never hardcode API keys - use environment variables in production
185
+ api_key = os.getenv("HUGGINGFACE_API_KEY", "your_huggingface_api_key")
186
+ rag = RAGApplication(hf_api_key=api_key)
187
 
188
+ with gr.Blocks() as demo:
189
+ gr.Markdown("# PDF Question Answering System")
190
+
191
+ with gr.Row():
192
+ pdf_input = gr.File(
193
+ label="Upload PDF",
194
+ file_types=[".pdf"],
195
+ type="filepath"
196
+ )
197
+ process_button = gr.Button("Process PDF")
198
+ status_output = gr.Textbox(label="Status", interactive=False)
199
+
200
+ process_button.click(
201
+ fn=rag.process_pdf,
202
+ inputs=[pdf_input],
203
+ outputs=[status_output]
204
+ )
205
+
206
+ chat_interface = gr.ChatInterface(
207
+ fn=rag.generate_response,
208
+ title="Chat with your PDF",
209
+ description="Upload a PDF and ask questions about its contents.",
210
+ theme="soft",
211
+ examples=[
212
+ "What is the main topic of this document?",
213
+ "Can you summarize the key points?",
214
+ "What are the main conclusions?",
215
+ ],
216
  )
 
 
217
 
218
+ logger.info("Gradio interface created successfully")
219
+ return demo
220
+ except Exception as e:
221
+ logger.error(f"Error creating Gradio interface: {str(e)}")
222
+ logger.error(f"Traceback: {traceback.format_exc()}")
223
+ raise
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it. Any startup failure is
    # logged (message plus full traceback) and re-raised so the process exits
    # with a non-zero status.
    try:
        logger.info("Starting application...")
        demo = create_gradio_interface()
        logger.info("Launching Gradio interface...")
        demo.launch()
    except Exception as err:
        logger.error("Application failed to start: %s", str(err))
        logger.error("Traceback: %s", traceback.format_exc())
        raise