msmaje committed on
Commit
7cd9b93
·
verified ·
1 Parent(s): 6a0c640

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +614 -90
app.py CHANGED
@@ -11,187 +11,711 @@ logging.basicConfig(level=logging.INFO)
11
  logger = logging.getLogger(__name__)
12
 
13
  try:
14
- from langchain_community.document_loaders import PyPDFDirectoryLoader
15
  from langchain.text_splitter import RecursiveCharacterTextSplitter
16
  from langchain_community.embeddings import HuggingFaceEmbeddings
17
  from langchain_community.vectorstores import FAISS
18
  from langchain.prompts import PromptTemplate
19
  from langchain.chains import RetrievalQA
20
- from langchain.llms import HuggingFaceHub
 
21
  LANGCHAIN_AVAILABLE = True
22
  except ImportError as e:
23
  logger.error(f"LangChain import error: {e}")
24
  LANGCHAIN_AVAILABLE = False
25
 
 
26
  PDF_FOLDER_PATH = "./pdfs"
27
  os.makedirs(PDF_FOLDER_PATH, exist_ok=True)
28
 
 
29
  vectorstore = None
30
  retrieval_qa = None
31
  embedding_model = None
 
 
32
  PRELOADED_PDFS = os.path.exists(PDF_FOLDER_PATH) and len(os.listdir(PDF_FOLDER_PATH)) > 0
33
 
34
  def initialize_models():
 
35
  global embedding_model
 
36
  try:
 
37
  embedding_model = HuggingFaceEmbeddings(
38
  model_name="sentence-transformers/all-MiniLM-L6-v2",
39
  model_kwargs={'device': 'cpu'}
40
  )
41
-
 
42
  hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
43
  if not hf_token:
44
- return False, "❌ HuggingFace API token not found"
45
-
46
- return True, "βœ… Models initialized"
 
47
  except Exception as e:
48
- logger.error(f"Init error: {e}")
49
- return False, str(e)
50
 
51
  def create_llm():
 
52
  hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
53
- if not hf_token:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  return create_fallback_llm()
55
 
56
- models_to_try = [
57
- "mistralai/Mistral-7B-Instruct-v0.2",
58
- "google/flan-t5-base"
59
- ]
60
-
61
- for model_id in models_to_try:
62
- try:
63
- llm = HuggingFaceHub(
64
- repo_id=model_id,
65
- huggingfacehub_api_token=hf_token,
66
- model_kwargs={
67
- "temperature": 0.7,
68
- "max_length": 512,
69
- "top_p": 0.9,
70
- "top_k": 50
71
- }
72
- )
73
- return llm
74
- except Exception as e:
75
- logger.warning(f"Model {model_id} failed: {e}")
76
- return create_fallback_llm()
77
-
78
  def create_fallback_llm():
 
79
  class FallbackLLM:
80
  def __call__(self, prompt):
81
- return "Model is unavailable. Try again later."
 
82
  def invoke(self, prompt):
83
  return self.__call__(prompt)
 
84
  return FallbackLLM()
85
 
86
  def load_preloaded_pdfs(chunk_size=1000, chunk_overlap=200):
 
87
  global vectorstore, retrieval_qa, embedding_model
88
-
89
  if not LANGCHAIN_AVAILABLE:
90
- return "❌ LangChain not available"
91
-
92
  if not PRELOADED_PDFS:
93
- return "❌ No PDFs found"
94
-
95
  try:
 
96
  if embedding_model is None:
97
- success, msg = initialize_models()
98
  if not success:
99
- return msg
100
-
 
101
  loader = PyPDFDirectoryLoader(PDF_FOLDER_PATH)
102
  documents = loader.load()
 
103
  if not documents:
104
- return "❌ No documents loaded"
105
-
106
- splitter = RecursiveCharacterTextSplitter(
107
- chunk_size=chunk_size, chunk_overlap=chunk_overlap
 
 
108
  )
109
- chunks = splitter.split_documents(documents)
 
 
110
  vectorstore = FAISS.from_documents(chunks, embedding_model)
111
  retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
112
-
 
113
  prompt_template = """
114
- Use the following context to answer the question. If you cannot find the answer, say so.
115
 
116
  Context:
117
  {context}
118
 
119
  Question: {question}
120
 
121
- Answer:
122
  """
123
  prompt = PromptTemplate(
124
- input_variables=["context", "question"],
125
  template=prompt_template
126
  )
127
-
 
128
  llm = create_llm()
129
- retrieval_qa = RetrievalQA.from_chain_type(
130
- llm=llm,
131
- chain_type="stuff",
132
- retriever=retriever,
133
- return_source_documents=True,
134
- chain_type_kwargs={"prompt": prompt}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
- return f"βœ… {len(documents)} docs loaded, {len(chunks)} chunks"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  except Exception as e:
139
- return f"❌ Error: {str(e)}"
 
140
 
141
  def answer_question(question):
 
142
  global retrieval_qa
 
143
  if not question.strip():
144
- return "❌ Enter a question", ""
 
145
  if retrieval_qa is None:
146
- return "❌ Process documents first", ""
147
-
148
  try:
 
149
  result = retrieval_qa({"query": question})
150
- answer = result.get("result", "No answer")
151
-
 
 
152
  sources = []
153
  for i, doc in enumerate(result.get("source_documents", []), 1):
154
  source = doc.metadata.get("source", "Unknown")
155
  page = doc.metadata.get("page", "Unknown")
156
- preview = doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
157
- sources.append(f"**Source {i}:** {Path(source).name} (Page {page})\n{preview}")
158
-
159
- return answer, "\n\n".join(sources)
 
 
 
 
160
  except Exception as e:
161
- return f"❌ Error: {str(e)}", ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
  def create_interface():
164
- with gr.Blocks(title="RAG PDF QA") as demo:
165
- gr.Markdown("## PDF QA with LangChain + HuggingFaceHub")
166
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  with gr.Row():
168
- with gr.Column():
169
- pdf_files = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs")
170
- chunk_size = gr.Slider(200, 2000, value=1000, label="Chunk Size")
171
- chunk_overlap = gr.Slider(0, 500, value=200, label="Chunk Overlap")
172
- process_btn = gr.Button("πŸ”„ Process PDFs")
173
- process_output = gr.Textbox(label="Processing Result")
174
-
175
- with gr.Column():
176
- question = gr.Textbox(label="Ask a Question")
177
- ask_btn = gr.Button("πŸ€” Ask")
178
- answer = gr.Textbox(label="Answer")
179
- sources = gr.Textbox(label="Sources")
180
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  process_btn.click(
 
 
 
 
 
 
182
  fn=load_preloaded_pdfs,
183
  inputs=[chunk_size, chunk_overlap],
184
  outputs=[process_output]
185
  )
186
-
 
 
 
 
 
 
187
  ask_btn.click(
188
  fn=answer_question,
189
- inputs=[question],
190
- outputs=[answer, sources]
191
  )
192
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  return demo
194
 
195
  if __name__ == "__main__":
196
- demo = create_interface()
197
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
11
  logger = logging.getLogger(__name__)
12
 
13
  try:
14
+ from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
15
  from langchain.text_splitter import RecursiveCharacterTextSplitter
16
  from langchain_community.embeddings import HuggingFaceEmbeddings
17
  from langchain_community.vectorstores import FAISS
18
  from langchain.prompts import PromptTemplate
19
  from langchain.chains import RetrievalQA
20
+ # This is the key change: Import HuggingFaceHub instead of HuggingFaceEndpoint
21
+ from langchain_community.llms import HuggingFaceHub
22
  LANGCHAIN_AVAILABLE = True
23
  except ImportError as e:
24
  logger.error(f"LangChain import error: {e}")
25
  LANGCHAIN_AVAILABLE = False
26
 
27
# Folder that holds PDFs shipped with (or extracted into) the app.
# It is created at import time so later os.listdir() calls cannot fail.
PDF_FOLDER_PATH = "./pdfs"
os.makedirs(PDF_FOLDER_PATH, exist_ok=True)

# Global state of the RAG system; populated lazily by the processing functions.
vectorstore = None
retrieval_qa = None
embedding_model = None

# True only when the folder contains at least one PDF. Counting *any* entry
# would report "pre-loaded PDFs" for a folder holding just a placeholder file
# such as .gitkeep. The extension check is case-insensitive ("REPORT.PDF").
PRELOADED_PDFS = any(
    entry.lower().endswith(".pdf") for entry in os.listdir(PDF_FOLDER_PATH)
)
38
 
39
def initialize_models():
    """Check for the Hugging Face token, then load the shared embedding model.

    Returns:
        A ``(success, message)`` tuple. ``message`` is a user-facing status
        string; on success the module-level ``embedding_model`` is set.
    """
    global embedding_model

    # Check the token *before* loading the embedding model. Doing it the other
    # way round left ``embedding_model`` populated while reporting failure, so
    # callers that only test ``embedding_model is None`` skipped this check on
    # their next attempt.
    hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if not hf_token:
        return False, "❌ HuggingFace API token not found in environment variables"

    try:
        embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
        )
    except Exception as e:
        logger.error(f"Model initialization error: {e}")
        return False, f"❌ Error initializing models: {str(e)}"

    return True, "✅ Models initialized successfully"
60
 
61
def create_llm():
    """Create the hosted LLM client, or a canned fallback if that fails.

    Returns:
        A ``HuggingFaceHub`` instance for Mistral-7B-Instruct, or the object
        returned by ``create_fallback_llm()`` when the API token is missing or
        the client cannot be constructed.
    """
    hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if not hf_token:
        # Without a token the client constructor is bound to fail; skip the
        # attempt and its error log.
        logger.warning("HUGGINGFACEHUB_API_TOKEN is not set; using fallback LLM")
        return create_fallback_llm()

    model_id = "mistralai/Mistral-7B-Instruct-v0.2"
    try:
        llm = HuggingFaceHub(
            repo_id=model_id,
            huggingfacehub_api_token=hf_token,
            model_kwargs={
                "temperature": 0.7,
                # Text-generation endpoints size the reply with
                # ``max_new_tokens``; ``max_length`` is not a generation
                # parameter for this task.
                "max_new_tokens": 512,
                "do_sample": True,
                "top_p": 0.9,
                "top_k": 50,
            },
        )
    except Exception as e:
        logger.error(f"LLM creation error: {e}")
        # Fall back to a local stub that does not use the Hugging Face API.
        return create_fallback_llm()

    logger.info(f"Successfully initialized LLM with model: {model_id}")
    return llm
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
def create_fallback_llm():
    """Build a stand-in "LLM" that answers every prompt with a fixed apology."""
    apology = (
        "I apologize, but I'm experiencing technical difficulties with the "
        "language model. Please try again later or contact support."
    )

    class FallbackLLM:
        """Callable stub exposing the two entry points the app may use."""

        def __call__(self, prompt):
            # The prompt is ignored; the reply is always the same text.
            return apology

        def invoke(self, prompt):
            return self(prompt)

    return FallbackLLM()
98
 
99
def load_preloaded_pdfs(chunk_size=1000, chunk_overlap=200):
    """Index the PDFs found in ``PDF_FOLDER_PATH`` and build the QA chain.

    On success the module-level ``vectorstore`` and ``retrieval_qa`` are
    replaced.

    Args:
        chunk_size: Target chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks; must not exceed
            ``chunk_size``.

    Returns:
        A user-facing status string (prefixed with ✅ or ❌).
    """
    global vectorstore, retrieval_qa, embedding_model

    # Validate the cheap, caller-supplied values first so a bad slider
    # combination fails before any PDF is parsed or embedded.
    try:
        chunk_size = int(chunk_size)
        chunk_overlap = int(chunk_overlap)
    except (TypeError, ValueError):
        return "❌ Chunk size and chunk overlap must be numbers."
    if chunk_overlap > chunk_size:
        return "❌ Chunk overlap cannot be larger than chunk size."

    if not LANGCHAIN_AVAILABLE:
        return "❌ LangChain is not available. Please check the installation."

    if not PRELOADED_PDFS:
        return "❌ No pre-loaded PDFs found in ./pdfs folder."

    try:
        # Initialize models if not already done.
        if embedding_model is None:
            success, message = initialize_models()
            if not success:
                return message

        documents = PyPDFDirectoryLoader(PDF_FOLDER_PATH).load()
        if not documents:
            return "❌ No documents were loaded from the PDFs folder."

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            # Pages without extractable text yield no chunks; building an
            # index from an empty list would fail with an opaque error.
            return "❌ No text could be extracted from the PDFs folder."

        vectorstore = FAISS.from_documents(chunks, embedding_model)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

        prompt_template = """
Use the following context to answer the question. If you cannot find the answer in the context, say "I don't have enough information to answer this question."

Context:
{context}

Question: {question}

Helpful Answer:
"""
        prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=prompt_template,
        )

        llm = create_llm()

        # Chain construction gets its own handler so its failure is reported
        # with a more specific message than the generic one below.
        try:
            retrieval_qa = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=retriever,
                return_source_documents=True,
                chain_type_kwargs={"prompt": prompt},
            )
        except Exception as chain_error:
            logger.error(f"Chain creation error: {chain_error}")
            return f"❌ Error creating QA chain: {str(chain_error)}"

        # Case-insensitive so "REPORT.PDF" is counted too.
        pdf_files = [
            f for f in os.listdir(PDF_FOLDER_PATH) if f.lower().endswith(".pdf")
        ]
        return f"✅ Successfully processed {len(documents)} documents from {len(pdf_files)} PDF files into {len(chunks)} chunks. Ready for questions!"

    except Exception as e:
        logger.error(f"Pre-loaded PDF processing error: {e}")
        return f"❌ Error processing pre-loaded PDFs: {str(e)}"
172
+
173
def extract_zip_to_pdfs(zip_file, dest_dir=None):
    """Copy every PDF inside an uploaded ZIP archive into the PDFs folder.

    Archive members are written flat, under their base name only, so nested
    folders, absolute paths and ``..`` components in member names can never
    place a file outside the target folder. A later member with the same base
    name overwrites an earlier one.

    Args:
        zip_file: Path to the archive, or an upload object exposing the path
            as ``.name``.
        dest_dir: Target folder. Defaults to ``PDF_FOLDER_PATH``; only in that
            case is the module-level ``PRELOADED_PDFS`` flag set.

    Returns:
        A user-facing status string (prefixed with ✅ or ❌).
    """
    global PRELOADED_PDFS

    if not zip_file:
        return "❌ Please upload a ZIP file."

    use_default_folder = dest_dir is None
    target_dir = PDF_FOLDER_PATH if use_default_folder else dest_dir
    zip_path = getattr(zip_file, "name", zip_file)

    try:
        os.makedirs(target_dir, exist_ok=True)

        extracted = 0
        with zipfile.ZipFile(zip_path, "r") as archive:
            for info in archive.infolist():
                if info.is_dir():
                    continue
                member = info.filename.replace("\\", "/")
                file_name = os.path.basename(member)
                if not file_name.lower().endswith(".pdf"):
                    continue
                # macOS archives carry "__MACOSX/._name.pdf" resource forks
                # that are not PDFs despite the extension.
                if member.startswith("__MACOSX/") or file_name.startswith("._"):
                    continue

                # Stream straight to the flat destination instead of
                # extracting and moving; no intermediate folders are created.
                out_path = os.path.join(target_dir, file_name)
                with archive.open(info) as src, open(out_path, "wb") as dst:
                    shutil.copyfileobj(src, dst)
                extracted += 1

        if not extracted:
            return "❌ No PDF files found in the ZIP archive."

        if use_default_folder:
            PRELOADED_PDFS = True

        return f"✅ Successfully extracted {extracted} PDF files. Now click 'Load Pre-existing PDFs' to process them."

    except Exception as e:
        return f"❌ Error extracting ZIP file: {str(e)}"
212
+
213
def process_pdfs(pdf_files, chunk_size, chunk_overlap):
    """Index uploaded PDF files and build the QA chain.

    Each upload is read in place from its own path. Nothing is copied to a
    scratch directory, so there is nothing to clean up on failure and two
    uploads that share a file name cannot overwrite each other. On success
    the module-level ``vectorstore`` and ``retrieval_qa`` are replaced.

    Args:
        pdf_files: Iterable of uploads; each item is a path string or an
            object exposing the path as ``.name``. ``None`` items are skipped.
        chunk_size: Target chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks; must not exceed
            ``chunk_size``.

    Returns:
        A user-facing status string (prefixed with ✅ or ❌).
    """
    global vectorstore, retrieval_qa, embedding_model

    if not pdf_files:
        return "❌ Please upload at least one PDF file or use pre-loaded PDFs."

    if not LANGCHAIN_AVAILABLE:
        return "❌ LangChain is not available. Please check the installation."

    try:
        chunk_size = int(chunk_size)
        chunk_overlap = int(chunk_overlap)
        if chunk_overlap > chunk_size:
            # Fail before any PDF is parsed; the splitter would reject this
            # combination anyway, but only after the documents were loaded.
            return "❌ Chunk overlap cannot be larger than chunk size."

        # Initialize models if not already done.
        if embedding_model is None:
            success, message = initialize_models()
            if not success:
                return message

        documents = []
        for pdf_file in pdf_files:
            if pdf_file is None:
                continue
            pdf_path = getattr(pdf_file, "name", pdf_file)
            documents.extend(PyPDFLoader(pdf_path).load())

        if not documents:
            return "❌ No documents were loaded. Please check your PDF files."

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            # Pages without extractable text yield no chunks; building an
            # index from an empty list would fail with an opaque error.
            return "❌ No text could be extracted from the uploaded PDFs."

        vectorstore = FAISS.from_documents(chunks, embedding_model)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

        prompt_template = """
Use the following context to answer the question. If you cannot find the answer in the context, say "I don't have enough information to answer this question."

Context:
{context}

Question: {question}

Helpful Answer:
"""
        prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=prompt_template,
        )

        llm = create_llm()

        # Chain construction gets its own handler so its failure is reported
        # with a more specific message than the generic one below.
        try:
            retrieval_qa = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=retriever,
                return_source_documents=True,
                chain_type_kwargs={"prompt": prompt},
            )
        except Exception as chain_error:
            logger.error(f"Chain creation error: {chain_error}")
            return f"❌ Error creating QA chain: {str(chain_error)}"

        return f"✅ Successfully processed {len(documents)} documents into {len(chunks)} chunks. Ready for questions!"

    except Exception as e:
        logger.error(f"PDF processing error: {e}")
        return f"❌ Error processing PDFs: {str(e)}"
297
 
298
  def answer_question(question):
299
+ """Answer a question using the RAG system with improved error handling"""
300
  global retrieval_qa
301
+
302
  if not question.strip():
303
+ return "❌ Please enter a question.", ""
304
+
305
  if retrieval_qa is None:
306
+ return "❌ Please upload and process PDF files first.", ""
307
+
308
  try:
309
+ # Get answer from RAG system with timeout and error handling
310
  result = retrieval_qa({"query": question})
311
+
312
+ answer = result.get("result", "No answer generated")
313
+
314
+ # Format source documents
315
  sources = []
316
  for i, doc in enumerate(result.get("source_documents", []), 1):
317
  source = doc.metadata.get("source", "Unknown")
318
  page = doc.metadata.get("page", "Unknown")
319
+ content_preview = doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
320
+
321
+ sources.append(f"**Source {i}:**\n- File: {Path(source).name}\n- Page: {page}\n- Preview: {content_preview}\n")
322
+
323
+ sources_text = "\n".join(sources) if sources else "No sources found."
324
+
325
+ return answer, sources_text
326
+
327
  except Exception as e:
328
+ logger.error(f"Question answering error: {e}")
329
+
330
+ # Provide a fallback response using just the retriever
331
+ try:
332
+ if vectorstore is not None:
333
+ # Get relevant documents directly from vectorstore
334
+ docs = vectorstore.similarity_search(question, k=3)
335
+
336
+ fallback_answer = "I found some relevant content in your documents:\n\n"
337
+ sources = []
338
+
339
+ for i, doc in enumerate(docs, 1):
340
+ source = doc.metadata.get("source", "Unknown")
341
+ page = doc.metadata.get("page", "Unknown")
342
+ content_preview = doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content
343
+
344
+ fallback_answer += f"**Excerpt {i}:** {content_preview}\n\n"
345
+ sources.append(f"**Source {i}:**\n- File: {Path(source).name}\n- Page: {page}\n")
346
+
347
+ sources_text = "\n".join(sources)
348
+
349
+ return fallback_answer + "\n*Note: This is a direct search result due to a technical issue with the AI model.*", sources_text
350
+ else:
351
+ return f"❌ Error answering question: {str(e)}", ""
352
+
353
+ except Exception as fallback_error:
354
+ logger.error(f"Fallback error: {fallback_error}")
355
+ return f"❌ Error answering question: {str(e)}", ""
356
+
357
def get_device_info():
    """Return an inline ``<script>`` snippet that flags narrow viewports.

    The script sets the CSS custom property ``--mobile-mode`` on the document
    root to ``'1'`` when the window is at most 768px wide and to ``'0'``
    otherwise, and re-evaluates on every resize. No detection happens in
    Python; this function only returns the markup as a string.

    NOTE(review): browsers generally do not execute ``<script>`` elements
    inserted as HTML after page load - confirm this snippet actually runs
    when embedded through ``gr.HTML`` in the Gradio version in use.
    """
    return """
<script>
function isMobile() {
    return window.innerWidth <= 768;
}

function adjustLayout() {
    const isMob = isMobile();
    const root = document.documentElement;
    if (isMob) {
        root.style.setProperty('--mobile-mode', '1');
    } else {
        root.style.setProperty('--mobile-mode', '0');
    }
}

window.addEventListener('resize', adjustLayout);
adjustLayout();
</script>
"""
378
 
379
def create_interface():
    """Build the responsive Gradio UI and wire it to the RAG functions.

    The layout has a left column for document management (upload, ZIP
    extraction, pre-loaded PDFs, chunking settings, status) and a right column
    for asking questions and showing the answer with its sources.

    Returns:
        The configured ``gr.Blocks`` app; the caller is expected to launch it.
    """
    # Custom CSS for better responsiveness.
    custom_css = """
    /* Base responsive styles */
    .gradio-container {
        max-width: 100% !important;
        margin: 0 auto;
        padding: 10px;
    }

    /* Mobile-first responsive design */
    @media (max-width: 768px) {
        .gradio-container {
            padding: 5px;
        }

        /* Stack elements vertically on mobile */
        .gr-row {
            flex-direction: column !important;
            gap: 10px !important;
        }

        /* Full width on mobile */
        .gr-column {
            width: 100% !important;
            min-width: 100% !important;
        }

        /* Adjust component spacing */
        .gr-form > * {
            margin-bottom: 8px !important;
        }

        /* Better button sizing */
        .gr-button {
            width: 100% !important;
            min-height: 44px !important;
            font-size: 14px !important;
        }

        /* Text input improvements */
        .gr-textbox textarea {
            min-height: 60px !important;
            font-size: 16px !important; /* Prevents zoom on iOS */
        }

        /* File upload improvements */
        .gr-file {
            min-height: 100px !important;
        }

        /* Slider improvements */
        .gr-slider {
            margin: 10px 0 !important;
        }

        /* Tab improvements */
        .gr-tab-nav {
            flex-wrap: wrap !important;
        }

        .gr-tab-nav > button {
            flex: 1 1 auto !important;
            min-width: 80px !important;
            font-size: 12px !important;
        }
    }

    /* Tablet styles */
    @media (min-width: 769px) and (max-width: 1024px) {
        .gradio-container {
            padding: 15px;
        }

        .gr-button {
            min-height: 40px !important;
        }
    }

    /* Desktop styles */
    @media (min-width: 1025px) {
        .gradio-container {
            max-width: 1400px;
            padding: 20px;
        }
    }

    /* Improve readability */
    .gr-markdown h1 {
        font-size: clamp(1.5rem, 4vw, 2.5rem) !important;
        line-height: 1.2 !important;
        margin-bottom: 1rem !important;
    }

    .gr-markdown h3 {
        font-size: clamp(1.1rem, 3vw, 1.4rem) !important;
        margin: 1rem 0 0.5rem 0 !important;
    }

    .gr-markdown p, .gr-markdown li {
        font-size: clamp(0.9rem, 2.5vw, 1rem) !important;
        line-height: 1.5 !important;
    }

    /* Status text improvements */
    .gr-textbox[data-testid="textbox"] {
        font-family: monospace !important;
        font-size: clamp(0.8rem, 2vw, 0.9rem) !important;
    }

    /* Accessibility improvements */
    .gr-button:focus,
    .gr-textbox:focus,
    .gr-file:focus {
        outline: 2px solid #2563eb !important;
        outline-offset: 2px !important;
    }

    /* Dark mode considerations */
    @media (prefers-color-scheme: dark) {
        .gr-button {
            border: 1px solid #374151 !important;
        }
    }
    """

    with gr.Blocks(
        title="PDF RAG System",
        theme=gr.themes.Soft(),
        css=custom_css,
    ) as demo:

        # Add device detection script.
        gr.HTML(get_device_info())

        gr.Markdown("""
        # 📚 PDF Question Answering System

        Upload your PDF documents and ask questions about their content!

        **Quick Start:**
        1. Upload PDFs or use pre-loaded ones
        2. Click Process to prepare your documents
        3. Ask questions about the content
        """)

        # Banner shown only when PDFs were present at startup.
        if PRELOADED_PDFS:
            gr.Markdown("""
            <div style="background: linear-gradient(90deg, #10b981, #059669);
            color: white; padding: 12px; border-radius: 8px; margin: 10px 0;">
            🎉 <strong>Pre-loaded PDFs detected!</strong> Use the 'Load Pre-existing PDFs' button to get started quickly.
            </div>
            """)

        # Main layout - responsive columns.
        with gr.Row():
            # Left column - upload & settings.
            with gr.Column(scale=1, min_width=300):
                gr.Markdown("### 📄 Document Management")

                with gr.Tabs():
                    with gr.TabItem("📁 Upload PDFs"):
                        pdf_files = gr.File(
                            label="Select PDF Files",
                            file_count="multiple",
                            file_types=[".pdf"],
                            height=120,
                        )
                        process_btn = gr.Button(
                            "🔄 Process PDFs", variant="primary", size="lg"
                        )

                    with gr.TabItem("🗂️ ZIP Upload"):
                        zip_file = gr.File(
                            label="Upload ZIP (with PDFs)",
                            file_count="single",
                            file_types=[".zip"],
                            height=80,
                        )
                        extract_btn = gr.Button(
                            "📦 Extract ZIP", variant="secondary", size="lg"
                        )
                        extract_output = gr.Textbox(
                            label="Extraction Status", lines=2, max_lines=3
                        )

                    with gr.TabItem("💾 Pre-loaded"):
                        if PRELOADED_PDFS:
                            # Case-insensitive match, sorted for a stable
                            # display order.
                            pdf_list = sorted(
                                f
                                for f in os.listdir(PDF_FOLDER_PATH)
                                if f.lower().endswith(".pdf")
                            )
                            gr.Markdown(f"**Found {len(pdf_list)} PDF files**")

                            # Show at most a handful of names to keep the tab
                            # short on small screens.
                            if len(pdf_list) <= 5:
                                for pdf in pdf_list:
                                    gr.Markdown(f"📄 {pdf}")
                            else:
                                for pdf in pdf_list[:3]:
                                    gr.Markdown(f"📄 {pdf}")
                                gr.Markdown(
                                    f"*... and {len(pdf_list) - 3} more files*"
                                )
                        else:
                            gr.Markdown("No pre-loaded PDFs found.")

                        # Always clickable: the ZIP flow tells the user to
                        # press this button after extraction, and the handler
                        # itself reports when the folder holds no PDFs. Tying
                        # ``interactive`` to the startup state left it disabled
                        # for the whole session.
                        preload_btn = gr.Button(
                            "📚 Load Pre-existing PDFs",
                            variant="primary",
                            size="lg",
                            interactive=True,
                        )

                # Settings section - collapsed by default.
                with gr.Accordion("⚙️ Advanced Settings", open=False):
                    chunk_size = gr.Slider(
                        minimum=200,
                        maximum=2000,
                        value=1000,
                        step=100,
                        label="Chunk Size",
                        info="Larger chunks = more context, smaller = more precise",
                    )
                    chunk_overlap = gr.Slider(
                        minimum=0,
                        maximum=500,
                        value=200,
                        step=50,
                        label="Chunk Overlap",
                        info="Overlap between text chunks",
                    )

                # Status display.
                process_output = gr.Textbox(
                    label="📊 Processing Status",
                    lines=3,
                    max_lines=5,
                    placeholder="Status updates will appear here...",
                )

            # Right column - Q&A section.
            with gr.Column(scale=2, min_width=400):
                gr.Markdown("### ❓ Ask Questions")

                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What would you like to know about your documents?",
                    lines=2,
                    max_lines=4,
                )
                ask_btn = gr.Button(
                    "🤔 Ask Question", variant="secondary", size="lg"
                )

                # Results section.
                with gr.Row():
                    answer_output = gr.Textbox(
                        label="💡 Answer",
                        lines=6,
                        max_lines=12,
                        placeholder="Your answer will appear here...",
                    )
                    sources_output = gr.Textbox(
                        label="📚 Sources",
                        lines=6,
                        max_lines=12,
                        placeholder="Source references will appear here...",
                    )

        # Event handlers.
        process_btn.click(
            fn=process_pdfs,
            inputs=[pdf_files, chunk_size, chunk_overlap],
            outputs=[process_output],
        )

        preload_btn.click(
            fn=load_preloaded_pdfs,
            inputs=[chunk_size, chunk_overlap],
            outputs=[process_output],
        )

        extract_btn.click(
            fn=extract_zip_to_pdfs,
            inputs=[zip_file],
            outputs=[extract_output],
        )

        ask_btn.click(
            fn=answer_question,
            inputs=[question_input],
            outputs=[answer_output, sources_output],
        )

        # Pressing Enter in the question box behaves like the Ask button.
        question_input.submit(
            fn=answer_question,
            inputs=[question_input],
            outputs=[answer_output, sources_output],
        )

        # Example questions.
        with gr.Accordion("💡 Example Questions", open=False):
            gr.Markdown("""
            **Try asking:**
            - What are the main topics in these documents?
            - Can you summarize the key findings?
            - What data is available for [specific topic]?
            - What are the differences between X and Y?
            """)

        # Footer with helpful info.
        gr.Markdown("""
        ---
        <div style="text-align: center; color: #666; font-size: 0.9em;">
        💡 <strong>Tip:</strong> For best results, ask specific questions about your documents
        </div>
        """)

    return demo
708
 
709
if __name__ == "__main__":
    demo = create_interface()

    if os.getenv("SPACE_ID"):
        # Running on Hugging Face Spaces: bind to all interfaces on the fixed
        # port and do not open a share tunnel.
        launch_options = {
            "server_name": "0.0.0.0",
            "server_port": 7860,
            "share": False,
        }
    else:
        # Local development: expose a temporary public share link.
        launch_options = {"share": True}

    demo.launch(**launch_options)