muddasser committed on
Commit
07f51e6
·
verified ·
1 Parent(s): a7b818e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -61
app.py CHANGED
@@ -3,7 +3,6 @@ import os
3
  import re
4
  import time
5
  import subprocess
6
- import tempfile
7
  import shutil
8
  import logging
9
  import psutil
@@ -20,21 +19,38 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
20
  from langchain.schema import Document
21
  import chromedriver_autoinstaller
22
 
23
- # Try importing transformers with error handling
24
  try:
25
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 
 
26
  except ImportError as e:
27
- st.error(f"Failed to import transformers: {str(e)}. Please ensure transformers==4.44.2 is installed correctly.")
28
  logging.error(f"Transformers import failed: {str(e)}")
29
- st.stop()
 
 
 
 
 
 
 
30
 
31
  # Set up logging
32
  logging.basicConfig(
33
  filename='/tmp/app.log',
34
- level=logging.INFO,
35
  format='%(asctime)s - %(levelname)s - %(message)s'
36
  )
37
 
 
 
 
 
 
 
 
 
38
  # Set page configuration
39
  st.set_page_config(
40
  page_title="Web Scraping + RAG Chatbot",
@@ -72,27 +88,22 @@ def cleanup_chromedriver_processes():
72
  if any('type=renderer' in arg for arg in cmdline):
73
  continue
74
  active_processes.append(f"{name} (PID {proc.pid})")
75
- proc.terminate() # Try graceful termination first
76
- try:
77
- proc.wait(timeout=3) # Wait for process to exit
78
- except psutil.TimeoutExpired:
79
- proc.kill() # Force kill if it doesn't exit
80
  logging.info(f"Terminated process {name} PID {proc.pid}")
81
- except (psutil.NoSuchProcess, psutil.AccessDenied):
82
  pass
83
  if active_processes:
84
  logging.info(f"Terminated processes: {', '.join(active_processes)}")
85
  else:
86
- logging.info("No Chrome-related processes found to terminate")
87
  except Exception as e:
88
  logging.warning(f"Error cleaning up processes: {str(e)}")
89
 
90
  def find_binary(binary_name):
91
  """Find binary path."""
92
  try:
93
- result = subprocess.check_output(
94
- ['which', binary_name], stderr=subprocess.DEVNULL
95
- ).decode().strip()
96
  logging.info(f"Found {binary_name} at: {result}")
97
  return result if result else None
98
  except Exception as e:
@@ -124,6 +135,7 @@ def check_disk_space():
124
 
125
  def setup_driver():
126
  """Set up Selenium WebDriver."""
 
127
  cleanup_chromedriver_processes()
128
  check_disk_space()
129
  chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
@@ -186,45 +198,56 @@ def setup_driver():
186
  options.add_argument('--ignore-certificate-errors')
187
  options.binary_location = chromium_path
188
 
189
- # Initialize ChromeDriver service with random port
190
- service = Service(executable_path=chromedriver_path, port=0)
191
- max_attempts = 5
192
- for attempt in range(max_attempts):
193
- try:
194
- logging.info(f"Attempt {attempt + 1}/{max_attempts} to start ChromeDriver")
195
- service.start()
196
- driver = webdriver.Chrome(service=service, options=options)
197
- driver.set_page_load_timeout(60)
198
- logging.info(f"ChromeDriver initialized successfully on attempt {attempt + 1}")
199
- return driver, service
200
- except WebDriverException as e:
201
- logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
202
- if attempt < max_attempts - 1:
203
- try:
204
- service.stop()
205
- except:
206
- pass
207
- time.sleep(3)
208
- else:
209
- logging.error(f"Failed to initialize WebDriver after {max_attempts} attempts: {str(e)}")
210
- st.error(f"Failed to initialize WebDriver: {str(e)}")
211
- return None, None
 
 
 
 
 
212
  return None, None
213
 
214
  def clean_text(text):
215
  """Clean and normalize scraped text."""
216
- text = re.sub(r'\s+', ' ', text)
217
- text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
218
- return text.strip()
 
 
 
 
219
 
220
  def scrape_website(url):
221
  """Scrape data from the given URL."""
 
222
  driver, service = setup_driver()
223
  if not driver or not service:
224
- st.error("Failed to initialize WebDriver. Please check if Chromium and ChromeDriver are properly installed.")
 
225
  return None
226
  try:
227
- logging.info(f"Attempting to scrape {url}")
228
  driver.get(url)
229
  WebDriverWait(driver, 30).until(
230
  EC.presence_of_element_located((By.TAG_NAME, "body"))
@@ -242,11 +265,13 @@ def scrape_website(url):
242
  for by, value in content_selectors:
243
  try:
244
  main_content = driver.find_element(by, value)
 
245
  break
246
  except:
247
  continue
248
  if not main_content:
249
  main_content = driver.find_element(By.TAG_NAME, "body")
 
250
  text_content = main_content.text
251
  cleaned_content = clean_text(text_content)
252
  logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
@@ -270,20 +295,25 @@ def scrape_website(url):
270
 
271
  @st.cache_resource
272
  def initialize_qa_model():
273
- """Initialize the QA model."""
274
  if st.session_state.qa_pipeline is None:
275
  try:
276
  with st.spinner("Loading FLAN-T5 model..."):
277
  model_name = "google/flan-t5-small"
278
  tokenizer = AutoTokenizer.from_pretrained(model_name)
279
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
280
- st.session_state.qa_pipeline = pipeline(
281
- "text2text-generation",
282
- model=model,
283
- tokenizer=tokenizer,
284
- max_length=200
285
- )
286
- return st.session_state.qa_pipeline
 
 
 
 
 
287
  except Exception as e:
288
  st.error(f"Failed to load QA model: {str(e)}")
289
  logging.error(f"Error loading QA model: {str(e)}")
@@ -305,6 +335,7 @@ def create_vector_store(text):
305
  model_kwargs={'device': 'cpu'}
306
  )
307
  vector_store = FAISS.from_documents(documents, embeddings)
 
308
  return vector_store
309
  except Exception as e:
310
  st.error(f"Error creating vector store: {str(e)}")
@@ -312,7 +343,7 @@ def create_vector_store(text):
312
  return None
313
 
314
  def answer_question(question):
315
- """Answer a question using RAG."""
316
  if st.session_state.vector_store is None:
317
  return "Please scrape a website first."
318
  if st.session_state.qa_pipeline is None:
@@ -326,13 +357,22 @@ def answer_question(question):
326
  Question: {question}
327
  Answer:
328
  """
329
- result = st.session_state.qa_pipeline(
330
- prompt,
331
- max_length=200,
332
- do_sample=False,
333
- temperature=0.1
334
- )
335
- return result[0]['generated_text'].strip()
 
 
 
 
 
 
 
 
 
336
  except Exception as e:
337
  logging.error(f"Error answering question: {str(e)}")
338
  return f"Error generating answer: {str(e)}"
@@ -348,7 +388,7 @@ app_mode = st.sidebar.radio("Choose a mode", ["Web Scraping", "Chat with Content
348
 
349
  if app_mode == "Web Scraping":
350
  st.header("🌐 Web Scraping")
351
- url = st.text_input("Enter URL to scrape", "https://en.wikipedia.org/wiki/Artificial_intelligence")
352
  if st.button("Scrape Website"):
353
  if url and is_valid_url(url):
354
  with st.spinner("Scraping website..."):
@@ -364,7 +404,7 @@ if app_mode == "Web Scraping":
364
  else:
365
  st.error("Failed to scrape the website. Check logs for details.")
366
  else:
367
- st.warning("Please enter a valid URL (e.g., https://en.wikipedia.org/wiki/Artificial_intelligence).")
368
 
369
  elif app_mode == "Chat with Content":
370
  st.header("💬 Chat with Scraped Content")
 
3
  import re
4
  import time
5
  import subprocess
 
6
  import shutil
7
  import logging
8
  import psutil
 
19
  from langchain.schema import Document
20
  import chromedriver_autoinstaller
21
 
22
+ # Try importing transformers with fallback
23
  try:
24
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
25
+ import transformers
26
+ logging.info(f"Transformers version: {transformers.__version__}")
27
  except ImportError as e:
28
+ st.error(f"Failed to import transformers: {str(e)}. Attempting fallback without pipeline.")
29
  logging.error(f"Transformers import failed: {str(e)}")
30
+ try:
31
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
32
+ import transformers
33
+ logging.info(f"Fallback: Imported AutoTokenizer and AutoModelForSeq2SeqLM, version: {transformers.__version__}")
34
+ except ImportError as e:
35
+ st.error(f"Failed to import transformers fallback: {str(e)}. Please ensure transformers==4.44.2 and tokenizers==0.19.1 are installed.")
36
+ logging.error(f"Transformers fallback import failed: {str(e)}")
37
+ st.stop()
38
 
39
  # Set up logging
40
  logging.basicConfig(
41
  filename='/tmp/app.log',
42
+ level=logging.DEBUG, # Increased verbosity
43
  format='%(asctime)s - %(levelname)s - %(message)s'
44
  )
45
 
46
+ # Health check
47
+ logging.info("Starting application health check")
48
+ try:
49
+ logging.info(f"Python version: {subprocess.check_output(['python', '--version']).decode().strip()}")
50
+ logging.info(f"Pip list: {subprocess.check_output(['pip', 'list']).decode()}")
51
+ except Exception as e:
52
+ logging.error(f"Health check failed: {str(e)}")
53
+
54
  # Set page configuration
55
  st.set_page_config(
56
  page_title="Web Scraping + RAG Chatbot",
 
88
  if any('type=renderer' in arg for arg in cmdline):
89
  continue
90
  active_processes.append(f"{name} (PID {proc.pid})")
91
+ proc.terminate()
92
+ proc.wait(timeout=3)
 
 
 
93
  logging.info(f"Terminated process {name} PID {proc.pid}")
94
+ except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.TimeoutExpired):
95
  pass
96
  if active_processes:
97
  logging.info(f"Terminated processes: {', '.join(active_processes)}")
98
  else:
99
+ logging.info("No Chrome-related processes found")
100
  except Exception as e:
101
  logging.warning(f"Error cleaning up processes: {str(e)}")
102
 
103
  def find_binary(binary_name):
104
  """Find binary path."""
105
  try:
106
+ result = subprocess.check_output(['which', binary_name], stderr=subprocess.DEVNULL).decode().strip()
 
 
107
  logging.info(f"Found {binary_name} at: {result}")
108
  return result if result else None
109
  except Exception as e:
 
135
 
136
  def setup_driver():
137
  """Set up Selenium WebDriver."""
138
+ logging.info("Setting up Selenium WebDriver")
139
  cleanup_chromedriver_processes()
140
  check_disk_space()
141
  chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
 
198
  options.add_argument('--ignore-certificate-errors')
199
  options.binary_location = chromium_path
200
 
201
+ # Initialize ChromeDriver
202
+ try:
203
+ service = Service(executable_path=chromedriver_path, port=0)
204
+ max_attempts = 3
205
+ for attempt in range(max_attempts):
206
+ try:
207
+ logging.info(f"Attempt {attempt + 1}/{max_attempts} to start ChromeDriver")
208
+ service.start()
209
+ driver = webdriver.Chrome(service=service, options=options)
210
+ driver.set_page_load_timeout(60)
211
+ logging.info(f"ChromeDriver initialized successfully on attempt {attempt + 1}")
212
+ return driver, service
213
+ except WebDriverException as e:
214
+ logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
215
+ if attempt < max_attempts - 1:
216
+ try:
217
+ service.stop()
218
+ except:
219
+ pass
220
+ time.sleep(2)
221
+ else:
222
+ logging.error(f"Failed to initialize WebDriver after {max_attempts} attempts: {str(e)}")
223
+ st.error(f"Failed to initialize WebDriver: {str(e)}")
224
+ return None, None
225
+ except Exception as e:
226
+ logging.error(f"Error initializing ChromeDriver service: {str(e)}")
227
+ st.error(f"Failed to initialize ChromeDriver service: {str(e)}")
228
+ return None, None
229
  return None, None
230
 
231
def clean_text(text):
    """Clean and normalize scraped text.

    Strips characters outside word characters, whitespace and basic
    punctuation, then collapses every whitespace run into a single space.

    Args:
        text: Raw text extracted from the scraped page.

    Returns:
        The cleaned, stripped text. On unexpected failure the original
        text is returned unchanged (best-effort cleaning).
    """
    try:
        # Remove disallowed characters FIRST, then collapse whitespace, so the
        # spaces introduced by the substitution do not survive as double
        # spaces in the output (the original order left runs like "a  b").
        text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
    except Exception as e:
        # Best-effort: cleaning must never break the scraping pipeline.
        logging.error(f"Error cleaning text: {str(e)}")
        return text
240
 
241
  def scrape_website(url):
242
  """Scrape data from the given URL."""
243
+ logging.info(f"Starting scrape for URL: {url}")
244
  driver, service = setup_driver()
245
  if not driver or not service:
246
+ st.error("Failed to initialize WebDriver. Please check if Chromium and ChromeDriver are installed.")
247
+ logging.error("WebDriver initialization failed")
248
  return None
249
  try:
250
+ logging.info(f"Navigating to {url}")
251
  driver.get(url)
252
  WebDriverWait(driver, 30).until(
253
  EC.presence_of_element_located((By.TAG_NAME, "body"))
 
265
  for by, value in content_selectors:
266
  try:
267
  main_content = driver.find_element(by, value)
268
+ logging.info(f"Found content with selector: {by}={value}")
269
  break
270
  except:
271
  continue
272
  if not main_content:
273
  main_content = driver.find_element(By.TAG_NAME, "body")
274
+ logging.info("Falling back to body tag for content")
275
  text_content = main_content.text
276
  cleaned_content = clean_text(text_content)
277
  logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
 
295
 
296
  @st.cache_resource
297
  def initialize_qa_model():
298
+ """Initialize the QA model with fallback."""
299
  if st.session_state.qa_pipeline is None:
300
  try:
301
  with st.spinner("Loading FLAN-T5 model..."):
302
  model_name = "google/flan-t5-small"
303
  tokenizer = AutoTokenizer.from_pretrained(model_name)
304
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
305
+ try:
306
+ st.session_state.qa_pipeline = pipeline(
307
+ "text2text-generation",
308
+ model=model,
309
+ tokenizer=tokenizer,
310
+ max_length=200
311
+ )
312
+ logging.info("Initialized QA pipeline successfully")
313
+ except NameError:
314
+ logging.warning("Pipeline not available, using raw model and tokenizer")
315
+ st.session_state.qa_pipeline = (model, tokenizer)
316
+ return st.session_state.qa_pipeline
317
  except Exception as e:
318
  st.error(f"Failed to load QA model: {str(e)}")
319
  logging.error(f"Error loading QA model: {str(e)}")
 
335
  model_kwargs={'device': 'cpu'}
336
  )
337
  vector_store = FAISS.from_documents(documents, embeddings)
338
+ logging.info("FAISS vector store created successfully")
339
  return vector_store
340
  except Exception as e:
341
  st.error(f"Error creating vector store: {str(e)}")
 
343
  return None
344
 
345
  def answer_question(question):
346
+ """Answer a question using RAG with fallback."""
347
  if st.session_state.vector_store is None:
348
  return "Please scrape a website first."
349
  if st.session_state.qa_pipeline is None:
 
357
  Question: {question}
358
  Answer:
359
  """
360
+ if isinstance(st.session_state.qa_pipeline, tuple):
361
+ # Fallback: Use raw model and tokenizer
362
+ model, tokenizer = st.session_state.qa_pipeline
363
+ inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
364
+ outputs = model.generate(**inputs, max_length=200, do_sample=False, temperature=0.1)
365
+ answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
366
+ else:
367
+ # Use pipeline
368
+ result = st.session_state.qa_pipeline(
369
+ prompt,
370
+ max_length=200,
371
+ do_sample=False,
372
+ temperature=0.1
373
+ )
374
+ answer = result[0]['generated_text']
375
+ return answer.strip()
376
  except Exception as e:
377
  logging.error(f"Error answering question: {str(e)}")
378
  return f"Error generating answer: {str(e)}"
 
388
 
389
  if app_mode == "Web Scraping":
390
  st.header("🌐 Web Scraping")
391
+ url = st.text_input("Enter URL to scrape", "https://example.com")
392
  if st.button("Scrape Website"):
393
  if url and is_valid_url(url):
394
  with st.spinner("Scraping website..."):
 
404
  else:
405
  st.error("Failed to scrape the website. Check logs for details.")
406
  else:
407
+ st.warning("Please enter a valid URL (e.g., https://example.com).")
408
 
409
  elif app_mode == "Chat with Content":
410
  st.header("💬 Chat with Scraped Content")