Shreyas094 committed on
Commit
73a7410
·
verified ·
1 Parent(s): 1471f55

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +197 -262
app.py CHANGED
@@ -1,283 +1,218 @@
1
- import gradio as gr
2
- from PyPDF2 import PdfReader
3
- from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
4
- from langchain.vectorstores import FAISS
5
- from huggingface_hub import InferenceClient
6
  import os
 
7
  import logging
8
- import traceback
9
- from datetime import datetime
10
- from typing import List, Dict, Tuple, Any
11
- import re
12
-
13
# Configure logging: DEBUG-level output goes both to a per-run log file
# (timestamped so successive runs never clobber each other) and to stderr.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # One fresh log file per process start, e.g. page_rag_20240101_120000.log
        logging.FileHandler(f'page_rag_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger used by every class/function below.
logger = logging.getLogger(__name__)
23
-
24
- class TextPreprocessor:
25
- @staticmethod
26
- def clean_text(text: str) -> str:
27
- """Clean and normalize text content."""
28
- # Remove multiple spaces
29
- text = re.sub(r'\s+', ' ', text)
30
- # Remove multiple newlines
31
- text = re.sub(r'\n\s*\n', '\n\n', text)
32
- # Normalize quotes
33
- text = re.sub(r'["\'""]', '"', text)
34
- # Remove header/footer artifacts
35
- text = re.sub(r'^.*Page \d+.*$', '', text, flags=re.MULTILINE)
36
- return text.strip()
37
-
38
- @staticmethod
39
- def extract_section_headers(text: str) -> List[str]:
40
- """Extract potential section headers from text."""
41
- # Simple header detection (can be enhanced based on document structure)
42
- header_pattern = r'^(?:[A-Z][A-Za-z\s]{2,50}:?|(?:\d+\.){1,3}\s+[A-Z][A-Za-z\s]{2,50})$'
43
- headers = re.findall(header_pattern, text, re.MULTILINE)
44
- return headers
45
-
46
def create_page_chunks(pdf_reader: PdfReader) -> List[Dict[str, Any]]:
    """
    Creates page-level chunks from PDF content.
    """
    chunks: List[Dict[str, Any]] = []
    cleaner = TextPreprocessor()

    # 1-based page numbers so metadata matches how readers cite pages.
    for page_num, page in enumerate(pdf_reader.pages, 1):
        try:
            raw_text = page.extract_text()
            if not raw_text.strip():
                # Skip blank pages entirely.
                continue

            normalized = cleaner.clean_text(raw_text)

            # One chunk per page, with its page number and detected headers.
            chunks.append({
                "content": normalized,
                "metadata": {
                    "page_num": page_num,
                    "section_headers": cleaner.extract_section_headers(normalized),
                },
            })

        except Exception as e:
            # A single unreadable page should not abort the whole document.
            logger.error(f"Error processing page {page_num}: {str(e)}")
            continue

    return chunks
77
-
78
class RAGApplication:
    """Conversational RAG pipeline over a single PDF.

    Embeds page chunks with the HF Inference API, indexes them in FAISS,
    and answers questions with a streamed Mistral chat completion.
    """

    def __init__(self, hf_api_key: str):
        try:
            self.hf_api_key = hf_api_key
            # FAISS index; stays None until process_pdf() succeeds.
            self.vector_store = None

            logger.info("Initializing HuggingFace embeddings...")
            self.embeddings = HuggingFaceInferenceAPIEmbeddings(
                api_key=hf_api_key,
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )

            logger.info("Initializing HuggingFace client...")
            self.client = InferenceClient(api_key=hf_api_key)
            self.conversation_history = []

            # Initialize cache: maps "query_k" -> retrieved context string.
            self.query_cache = {}

            logger.info("RAGApplication initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing RAGApplication: {str(e)}")
            logger.error(f"Traceback: {traceback.format_exc()}")
            raise

        # Prompt template consumed by generate_response(); placeholders are
        # {context}, {conversation_history}, and {question}.
        self.system_prompt = """You are a precise and accurate PDF summarization assistant. Your role is to:
1. Provide accurate answers based solely on the provided context
2. Maintain factual consistency and never hallucinate information
3. Clearly indicate when information is not available in the context
4. Use concise language and avoid unnecessary elaboration
5. Maintain continuity with previous conversation when relevant

Context: {context}

Previous conversation:
{conversation_history}

Question: {question}

Answer:"""

    def process_pdf(self, file_path: str) -> str:
        """Ingest one PDF: reset per-document state, chunk pages, build the
        FAISS index. Returns a human-readable status string for the UI."""
        try:
            logger.info(f"Starting PDF processing for file: {file_path}")

            if file_path is None or not os.path.exists(file_path):
                return "Please upload a valid PDF file."

            # Reset conversation history and cache: state belongs to one PDF.
            self.conversation_history = []
            self.query_cache = {}

            pdf_reader = PdfReader(file_path)

            # Create page chunks (module-level helper; one chunk per page)
            page_chunks = create_page_chunks(pdf_reader)

            # Create vector store
            logger.info("Creating vector store...")
            self.vector_store = FAISS.from_texts(
                [chunk["content"] for chunk in page_chunks],
                self.embeddings,
                metadatas=[chunk["metadata"] for chunk in page_chunks]
            )

            logger.info("Vector store created successfully")
            return "PDF processed successfully!"

        except Exception as e:
            logger.error(f"Error in PDF processing: {str(e)}")
            return f"Error processing PDF: {str(e)}"

    def retrieve_context(self, query: str, k: int = 3) -> str:
        """
        Retrieve relevant pages for the given query.

        Results are memoized per (query, k) in self.query_cache; the cache
        is cleared whenever a new PDF is processed.
        """
        # Check query cache
        cache_key = f"{query}_{k}"
        if cache_key in self.query_cache:
            return self.query_cache[cache_key]

        # Get relevant pages
        # NOTE(review): assumes process_pdf() ran first; vector_store is
        # None otherwise and this raises — callers guard for that.
        results = self.vector_store.similarity_search_with_score(query, k=k)

        # Combine context while preserving document structure: each snippet
        # is prefixed with its page number (and first section header, if any)
        context = []
        for doc, score in results:
            context_str = f"[Page {doc.metadata['page_num']}"

            if doc.metadata.get('section_headers'):
                context_str += f", Section: {doc.metadata['section_headers'][0]}"

            context_str += f"]: {doc.page_content}"
            context.append(context_str)

        final_context = "\n\n".join(context)

        # Cache the result
        self.query_cache[cache_key] = final_context
        return final_context

    def generate_response(self, message: str, history: List[Tuple[str, str]]) -> str:
        """Gradio ChatInterface callback: answer `message` given `history`."""
        try:
            logger.info(f"Generating response for message: {message}")

            if not self.vector_store:
                return "Please upload and process a PDF first."

            query = message.strip()
            if not query:
                return "Please enter a question."

            # Get relevant context
            context = self.retrieve_context(query)

            # Format conversation history (only the last three exchanges)
            conversation_history = "\n".join([
                f"Q: {q}\nA: {a}" for q, a in history[-3:] if q and a
            ])

            # Create prompt
            prompt = self.system_prompt.format(
                context=context,
                conversation_history=conversation_history,
                question=query
            )

            # Generate response using Mistral
            logger.info("Generating response using Mistral...")
            response = ""
            try:
                # NOTE(review): the loop variable shadows the `message`
                # parameter, and delta.content may be None on some stream
                # chunks — both worth fixing in a follow-up.
                for message in self.client.chat_completion(
                    model="mistralai/Mistral-Nemo-Instruct-2407",
                    messages=[
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": query}
                    ],
                    max_tokens=10000,
                    stream=True,
                ):
                    response += message.choices[0].delta.content
                logger.info("Response generated successfully")
            except Exception as e:
                logger.error(f"Error in chat completion: {str(e)}")
                raise

            return response
        except Exception as e:
            error_msg = f"Error generating response: {str(e)}"
            logger.error(error_msg)
            logger.error(f"Traceback: {traceback.format_exc()}")
            return error_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
def create_gradio_interface():
    """Build the Gradio UI: a PDF upload/process row plus a chat interface
    bound to one RAGApplication instance. Returns the Blocks app."""
    try:
        logger.info("Creating Gradio interface...")
        # NOTE(review): getenv may return None if HF_API_KEY is unset —
        # RAGApplication would then fail downstream; confirm deploy config.
        api_key = os.getenv("HF_API_KEY")
        rag = RAGApplication(hf_api_key=api_key)

        with gr.Blocks() as demo:
            gr.Markdown("# PDF Question Answering System")

            with gr.Row():
                pdf_input = gr.File(
                    label="Upload PDF",
                    file_types=[".pdf"],
                    type="filepath"
                )
                process_button = gr.Button("Process PDF")
                status_output = gr.Textbox(label="Status", interactive=False)

            # Processing the PDF (re)builds the vector store and reports
            # success/failure into the status box.
            process_button.click(
                fn=rag.process_pdf,
                inputs=[pdf_input],
                outputs=[status_output]
            )

            # Chat panel; generate_response guards against querying before
            # a PDF has been processed.
            chat_interface = gr.ChatInterface(
                fn=rag.generate_response,
                title="Chat with your PDF",
                description="Upload a PDF and ask questions about its contents.",
                theme="soft",
                examples=[
                    "What is the main topic of this document?",
                    "Can you summarize the key points?",
                    "What are the main conclusions?",
                ],
            )

        logger.info("Gradio interface created successfully")
        return demo
    except Exception as e:
        logger.error(f"Error creating Gradio interface: {str(e)}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
 
274
if __name__ == "__main__":
    # Script entry point: build the UI and serve it; any startup failure is
    # logged with a full traceback before re-raising.
    try:
        logger.info("Starting application...")
        demo = create_gradio_interface()
        logger.info("Launching Gradio interface...")
        demo.launch()
    except Exception as e:
        logger.error(f"Application failed to start: {str(e)}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        raise
 
 
 
 
 
 
1
  import os
2
+ import json
3
  import logging
4
+ import shutil
5
+ import gradio as gr
6
+ from typing import List
7
+ from tempfile import NamedTemporaryFile
8
+ from huggingface_hub import InferenceClient
9
+ from langchain_community.document_loaders import PyPDFLoader
10
+ from langchain_community.embeddings import HuggingFaceEmbeddings
11
+ from langchain_community.vectorstores import FAISS
12
+ from langchain.docstore.document import Document
13
+
14
# Setup logging
logging.basicConfig(level=logging.INFO)

# Constants
# JSON registry persisting each uploaded document's name and selection state.
DOCUMENTS_FILE = "uploaded_documents.json"
# NOTE(review): "@cf/..." looks like a Cloudflare Workers AI model id, not a
# Hugging Face repo id — confirm InferenceClient can actually resolve it.
DEFAULT_MODEL = "@cf/meta/llama-2-7b-chat"
HF_TOKEN = os.getenv("HF_API_TOKEN")  # Make sure to set this environment variable
21
+
22
def get_embeddings():
    """Create the sentence-embedding model shared by all vector operations."""
    model = HuggingFaceEmbeddings(model_name="avsolatorio/GIST-Embedding-v0")
    return model
24
+
25
def load_documents():
    """Load the persisted upload registry.

    Returns:
        A list of {"name": str, "selected": bool} dicts, or [] when the
        registry file is missing or unreadable.
    """
    if os.path.exists(DOCUMENTS_FILE):
        try:
            with open(DOCUMENTS_FILE, "r") as f:
                return json.load(f)
        except (json.JSONDecodeError, OSError) as e:
            # A corrupted/unreadable registry previously crashed every
            # caller; treat it as empty and log instead.
            logging.error(f"Could not read {DOCUMENTS_FILE}: {e}")
    return []
30
+
31
def save_documents(documents):
    """Persist the upload registry to DOCUMENTS_FILE as JSON."""
    with open(DOCUMENTS_FILE, "w") as handle:
        json.dump(documents, handle)
34
+
35
def load_document(file: NamedTemporaryFile) -> List[Document]:
    """Load a PDF from an uploaded file handle and split it into per-page
    Documents via PyPDF."""
    return PyPDFLoader(file.name).load_and_split()
39
+
40
def update_vectors(files):
    """Ingest uploaded PDFs into the FAISS store and update the registry.

    Args:
        files: uploaded file objects from the Gradio File component.

    Returns:
        (status message, CheckboxGroup update listing known documents).
    """
    uploaded_documents = load_documents()

    def doc_list_update(docs):
        # The previous version returned the raw list of registry dicts,
        # which a CheckboxGroup output cannot render; return a component
        # update with plain name strings instead.
        return gr.CheckboxGroup(
            choices=[d["name"] for d in docs],
            value=[d["name"] for d in docs if d.get("selected")],
        )

    if not files:
        return "Please upload at least one file.", doc_list_update(uploaded_documents)

    embed = get_embeddings()
    total_chunks = 0

    all_data = []
    for file in files:
        try:
            data = load_document(file)
            if not data:
                continue

            all_data.extend(data)
            total_chunks += len(data)

            # Register each file once; newly uploaded docs default to selected.
            if not any(doc["name"] == file.name for doc in uploaded_documents):
                uploaded_documents.append({"name": file.name, "selected": True})

        except Exception as e:
            # One bad PDF should not abort the rest of the batch.
            logging.error(f"Error processing file {file.name}: {str(e)}")

    if not all_data:
        return "No valid data could be extracted from the uploaded files.", doc_list_update(uploaded_documents)

    try:
        if os.path.exists("faiss_database"):
            # Extend the existing index rather than rebuilding from scratch.
            database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
            database.add_documents(all_data)
        else:
            database = FAISS.from_documents(all_data, embed)
        database.save_local("faiss_database")

        save_documents(uploaded_documents)
        return f"Vector store updated successfully. Processed {total_chunks} chunks.", doc_list_update(uploaded_documents)

    except Exception as e:
        # Keep the current document list visible instead of blanking it.
        return f"Error updating vector store: {str(e)}", doc_list_update(uploaded_documents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
def delete_documents(selected_docs):
    """Remove documents from both the FAISS index and the upload registry.

    Args:
        selected_docs: document names (registry "name" values) to remove.

    Returns:
        (status message, CheckboxGroup update listing remaining documents).
    """
    uploaded_documents = load_documents()

    def doc_list_update(docs):
        # CheckboxGroup needs plain name strings, not the registry dicts
        # the previous version returned.
        return gr.CheckboxGroup(
            choices=[d["name"] for d in docs],
            value=[d["name"] for d in docs if d.get("selected")],
        )

    if not selected_docs:
        return "No documents selected for deletion.", doc_list_update(uploaded_documents)

    # Prune the registry even when no FAISS directory exists — the old code
    # only pruned under `if os.path.exists(...)`, letting the JSON file
    # drift out of sync with the index.
    removed = [doc["name"] for doc in uploaded_documents if doc["name"] in selected_docs]
    uploaded_documents = [doc for doc in uploaded_documents if doc["name"] not in selected_docs]
    save_documents(uploaded_documents)

    if os.path.exists("faiss_database"):
        embed = get_embeddings()
        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)

        # Keep only chunks whose source file was not selected for deletion.
        # NOTE(review): "source" metadata is the loader's file path —
        # assumed to match registry names; confirm against load_document().
        docs_to_keep = [
            doc for doc in database.docstore._dict.values()
            if doc.metadata.get("source") not in selected_docs
        ]

        if not docs_to_keep:
            # Nothing left: drop the index directory entirely.
            shutil.rmtree("faiss_database")
        else:
            FAISS.from_documents(docs_to_keep, embed).save_local("faiss_database")

    if removed:
        return f"Deleted documents: {', '.join(selected_docs)}", doc_list_update(uploaded_documents)
    return "No documents to delete.", doc_list_update(uploaded_documents)
108
 
109
def get_response(query, temperature=0.2):
    """Answer a question using RAG over the selected uploaded documents.

    Args:
        query: the user's question.
        temperature: sampling temperature passed to the chat model.

    Returns:
        The model's answer, or a human-readable status/error message.
    """
    if not query.strip():
        return "Please enter a question."

    uploaded_documents = load_documents()
    # NOTE(review): selection comes from the persisted registry, not the
    # UI checkboxes — unchecking boxes in the interface has no effect here.
    selected_docs = [doc["name"] for doc in uploaded_documents if doc["selected"]]

    if not selected_docs:
        return "Please select at least one document to search through."

    # Check for the index BEFORE paying the cost of loading the embedding
    # model (the old order loaded the model only to bail out immediately).
    if not os.path.exists("faiss_database"):
        return "No documents available. Please upload PDF documents first."

    embed = get_embeddings()
    database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)

    # Restrict retrieval to chunks originating from the selected documents.
    filtered_docs = [
        doc for doc in database.docstore._dict.values()
        if isinstance(doc, Document) and doc.metadata.get("source") in selected_docs
    ]

    if not filtered_docs:
        return "No relevant information found in the selected documents."

    # NOTE(review): rebuilding a FAISS index per query re-embeds every
    # chunk; a metadata filter on the main index would avoid this cost.
    filtered_db = FAISS.from_documents(filtered_docs, embed)
    retriever = filtered_db.as_retriever(search_kwargs={"k": 5})
    relevant_docs = retriever.get_relevant_documents(query)

    context_str = "\n".join(doc.page_content for doc in relevant_docs)

    messages = [
        {"role": "system", "content": "You are a helpful assistant that provides accurate answers based on the given context."},
        {"role": "user", "content": f"Context:\n{context_str}\n\nQuestion: {query}\n\nProvide a comprehensive answer based only on the given context."}
    ]

    # NOTE(review): DEFAULT_MODEL looks like a Cloudflare Workers AI id
    # ("@cf/..."), which the HF InferenceClient cannot resolve — confirm.
    client = InferenceClient(DEFAULT_MODEL, token=HF_TOKEN)

    try:
        response = client.chat_completion(
            messages=messages,
            max_tokens=1000,
            temperature=temperature,
            top_p=0.9,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating response: {str(e)}"
157
+
158
def create_interface():
    """Build the Gradio Blocks UI wiring uploads, deletion, and Q&A.

    Returns:
        The assembled gr.Blocks application (not yet launched).
    """
    with gr.Blocks(title="PDF Question Answering System") as app:
        gr.Markdown("# PDF Question Answering System")

        with gr.Row():
            with gr.Column():
                files = gr.File(
                    label="Upload PDF Documents",
                    file_types=[".pdf"],
                    # BUG FIX: gr.File has no `multiple` kwarg (TypeError at
                    # startup); multi-upload is requested via file_count.
                    file_count="multiple"
                )
                upload_button = gr.Button("Upload and Process")

            with gr.Column():
                doc_status = gr.Textbox(label="Status", interactive=False)
                # BUG FIX: the component class is CheckboxGroup (capital G);
                # `Checkboxgroup` is not a current gradio attribute.
                doc_list = gr.CheckboxGroup(
                    label="Available Documents",
                    choices=[],
                    interactive=True
                )
                delete_button = gr.Button("Delete Selected Documents")

        with gr.Row():
            with gr.Column():
                question = gr.Textbox(label="Ask a question about the documents")
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.2,
                    step=0.1,
                    label="Temperature"
                )
                submit_button = gr.Button("Submit Question")

            with gr.Column():
                answer = gr.Textbox(label="Answer", interactive=False)

        # Event handlers
        upload_button.click(
            fn=update_vectors,
            inputs=[files],
            outputs=[doc_status, doc_list]
        )

        delete_button.click(
            fn=delete_documents,
            inputs=[doc_list],
            outputs=[doc_status, doc_list]
        )

        submit_button.click(
            fn=get_response,
            inputs=[question, temperature],
            outputs=[answer]
        )

    return app
215
 
216
if __name__ == "__main__":
    # Script entry point: build the Gradio UI and serve it.
    demo = create_interface()
    demo.launch()