muddasser committed on
Commit
3641b91
·
verified ·
1 Parent(s): f00e167

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -77
app.py CHANGED
@@ -1,25 +1,25 @@
1
  import streamlit as st
 
 
 
 
 
 
 
 
 
2
  from selenium import webdriver
3
  from selenium.webdriver.chrome.service import Service
4
  from selenium.webdriver.chrome.options import Options
5
  from selenium.webdriver.common.by import By
6
  from selenium.webdriver.support.ui import WebDriverWait
7
  from selenium.webdriver.support import expected_conditions as EC
 
8
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_community.embeddings import HuggingFaceEmbeddings
12
  from langchain.schema import Document
13
- import logging
14
- import subprocess
15
- import traceback
16
- import os
17
- import re
18
- import time
19
- import psutil
20
- import tempfile
21
- import shutil
22
- from selenium.common.exceptions import WebDriverException, TimeoutException
23
 
24
  # Set up logging
25
  logging.basicConfig(
@@ -57,34 +57,24 @@ def cleanup_chromedriver_processes():
57
  """Kill any lingering ChromeDriver and Chrome processes."""
58
  try:
59
  for proc in psutil.process_iter(['name', 'cmdline']):
60
- if proc.info['name'] in ['chromedriver', 'chrome', 'chromium', 'chromium-browser']:
61
- try:
62
- # Skip renderer processes to avoid system issues
63
- cmdline = proc.info.get('cmdline', [])
64
  if any('type=renderer' in arg for arg in cmdline):
65
  continue
66
  proc.kill()
67
- logging.info(f"Killed process {proc.info['name']} PID {proc.pid}")
68
- except (psutil.NoSuchProcess, psutil.AccessDenied):
69
- pass
70
  except Exception as e:
71
  logging.warning(f"Error cleaning up processes: {str(e)}")
72
 
73
- def check_port(port):
74
- """Check if a port is in use."""
75
- try:
76
- result = subprocess.run(['netstat', '-tuln'], capture_output=True, text=True, timeout=5)
77
- return f':{port}' in result.stdout
78
- except Exception as e:
79
- logging.warning(f"Error checking port {port}: {str(e)}")
80
- return False
81
-
82
  def find_binary(binary_name):
83
  """Find binary path."""
84
  try:
85
  result = subprocess.check_output(
86
- ['which', binary_name],
87
- stderr=subprocess.DEVNULL
88
  ).decode().strip()
89
  return result if result else None
90
  except Exception as e:
@@ -109,7 +99,6 @@ def check_versions(chromium_path, chromedriver_path):
109
  def setup_driver():
110
  """Set up Selenium WebDriver with unique user data directory."""
111
  cleanup_chromedriver_processes()
112
-
113
  chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
114
  chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')
115
 
@@ -119,7 +108,6 @@ def setup_driver():
119
  if not chromium_path:
120
  st.error("Chromium not found. Please ensure it's installed.")
121
  return None, None
122
-
123
  if not os.path.exists(chromedriver_path):
124
  chromedriver_path = find_binary('chromedriver')
125
  if not chromedriver_path:
@@ -132,7 +120,6 @@ def setup_driver():
132
  subprocess.run(['chmod', '+x', chromedriver_path], check=True)
133
  except subprocess.CalledProcessError:
134
  st.warning(f"Could not set executable permissions on {chromedriver_path}")
135
-
136
  if not os.access(chromium_path, os.X_OK):
137
  try:
138
  subprocess.run(['chmod', '+x', chromium_path], check=True)
@@ -150,36 +137,25 @@ def setup_driver():
150
  options.add_argument('--disable-extensions')
151
  options.add_argument('--disable-background-networking')
152
  options.add_argument('--window-size=1920,1080')
153
- options.add_argument('--remote-debugging-port=0')
154
  options.add_argument('--ignore-certificate-errors')
155
- options.add_argument('--disable-web-security')
156
-
157
- # Create unique user data directory
158
- temp_dir = tempfile.mkdtemp()
159
- options.add_argument(f"--user-data-dir={temp_dir}")
160
-
161
  options.binary_location = chromium_path
162
 
163
- # Initialize ChromeDriver service
164
- service = Service(executable_path=chromedriver_path)
165
-
 
 
 
 
166
  max_attempts = 3
167
  for attempt in range(max_attempts):
168
  try:
169
- if check_port(service.port):
170
- logging.warning(f"Port {service.port} in use, stopping service")
171
- service.stop()
172
- time.sleep(1)
173
-
174
  service.start()
175
  driver = webdriver.Chrome(service=service, options=options)
176
  driver.set_page_load_timeout(60)
177
-
178
- # Store temp directory for cleanup
179
  driver.temp_dir = temp_dir
180
- logging.info(f"ChromeDriver initialized on port {service.port} with temp dir: {temp_dir}")
181
  return driver, service
182
-
183
  except WebDriverException as e:
184
  logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
185
  if attempt < max_attempts - 1:
@@ -189,13 +165,12 @@ def setup_driver():
189
  pass
190
  time.sleep(2)
191
  else:
192
- # Clean up temp directory if creation failed
193
  try:
194
- shutil.rmtree(temp_dir)
195
  except:
196
  pass
197
  logging.error(f"Failed to initialize WebDriver after {max_attempts} attempts: {str(e)}")
198
- st.error(f"Failed to initialize WebDriver after {max_attempts} attempts: {str(e)}")
199
  return None, None
200
  return None, None
201
 
@@ -211,18 +186,14 @@ def scrape_website(url):
211
  if not driver or not service:
212
  st.error("Failed to initialize WebDriver. Please check if Chromium and ChromeDriver are properly installed.")
213
  return None
214
-
215
  try:
216
  logging.info(f"Attempting to scrape {url}")
217
  driver.get(url)
218
  WebDriverWait(driver, 30).until(
219
  EC.presence_of_element_located((By.TAG_NAME, "body"))
220
  )
221
-
222
- # Get page title
223
  title = driver.title
224
-
225
- # Try multiple selectors for main content
226
  content_selectors = [
227
  (By.ID, "content"),
228
  (By.CLASS_NAME, "mw-parser-output"),
@@ -231,7 +202,6 @@ def scrape_website(url):
231
  (By.ID, "main"),
232
  (By.TAG_NAME, "article")
233
  ]
234
-
235
  main_content = None
236
  for by, value in content_selectors:
237
  try:
@@ -239,41 +209,29 @@ def scrape_website(url):
239
  break
240
  except:
241
  continue
242
-
243
  if not main_content:
244
  main_content = driver.find_element(By.TAG_NAME, "body")
245
-
246
  text_content = main_content.text
247
  cleaned_content = clean_text(text_content)
248
  logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
249
-
250
  return {
251
  "title": title,
252
  "content": cleaned_content,
253
  "url": url
254
  }
255
-
256
  except (WebDriverException, TimeoutException) as e:
257
  logging.error(f"Error scraping {url}: {str(e)}")
258
  st.error(f"Error scraping {url}: {str(e)}")
259
  return None
260
-
261
  finally:
262
  try:
263
  driver.quit()
264
  service.stop()
265
-
266
- # Clean up temporary directory
267
- if hasattr(driver, 'temp_dir'):
268
- try:
269
- shutil.rmtree(driver.temp_dir)
270
- logging.info(f"Cleaned up temp directory: {driver.temp_dir}")
271
- except Exception as e:
272
- logging.warning(f"Error cleaning up temp directory: {str(e)}")
273
-
274
- logging.info("WebDriver and service stopped")
275
  except Exception as e:
276
- logging.warning(f"Error quitting driver: {str(e)}")
277
 
278
  @st.cache_resource
279
  def initialize_qa_model():
@@ -290,7 +248,7 @@ def initialize_qa_model():
290
  tokenizer=tokenizer,
291
  max_length=200
292
  )
293
- return st.session_state.qa_pipeline
294
  except Exception as e:
295
  st.error(f"Failed to load QA model: {str(e)}")
296
  logging.error(f"Error loading QA model: {str(e)}")
@@ -394,7 +352,8 @@ elif app_mode == "Chat with Content":
394
  elif app_mode == "About":
395
  st.header("ℹ️ About")
396
  st.markdown("""
397
- This app uses Selenium for web scraping, LangChain for vector storage with FAISS, and Hugging Face models for embeddings and question answering.
 
398
  - **Web Scraping**: Extracts text using headless Chromium.
399
  - **RAG**: Indexes content with sentence-transformers and answers questions using FLAN-T5.
400
  - **Tech Stack**: Python, Streamlit, Selenium, LangChain, Hugging Face Transformers, FAISS.
 
1
  import streamlit as st
2
+ import os
3
+ import re
4
+ import time
5
+ import uuid
6
+ import subprocess
7
+ import tempfile
8
+ import shutil
9
+ import logging
10
+ import psutil
11
  from selenium import webdriver
12
  from selenium.webdriver.chrome.service import Service
13
  from selenium.webdriver.chrome.options import Options
14
  from selenium.webdriver.common.by import By
15
  from selenium.webdriver.support.ui import WebDriverWait
16
  from selenium.webdriver.support import expected_conditions as EC
17
+ from selenium.common.exceptions import WebDriverException, TimeoutException
18
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
19
  from langchain.text_splitter import RecursiveCharacterTextSplitter
20
  from langchain_community.vectorstores import FAISS
21
  from langchain_community.embeddings import HuggingFaceEmbeddings
22
  from langchain.schema import Document
 
 
 
 
 
 
 
 
 
 
23
 
24
  # Set up logging
25
  logging.basicConfig(
 
57
  """Kill any lingering ChromeDriver and Chrome processes."""
58
  try:
59
  for proc in psutil.process_iter(['name', 'cmdline']):
60
+ try:
61
+ name = proc.info['name'].lower()
62
+ cmdline = proc.info.get('cmdline', [])
63
+ if 'chromedriver' in name or 'chrome' in name or 'chromium' in name:
64
  if any('type=renderer' in arg for arg in cmdline):
65
  continue
66
  proc.kill()
67
+ logging.info(f"Killed process {name} PID {proc.pid}")
68
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
69
+ pass
70
  except Exception as e:
71
  logging.warning(f"Error cleaning up processes: {str(e)}")
72
 
 
 
 
 
 
 
 
 
 
73
  def find_binary(binary_name):
74
  """Find binary path."""
75
  try:
76
  result = subprocess.check_output(
77
+ ['which', binary_name], stderr=subprocess.DEVNULL
 
78
  ).decode().strip()
79
  return result if result else None
80
  except Exception as e:
 
99
  def setup_driver():
100
  """Set up Selenium WebDriver with unique user data directory."""
101
  cleanup_chromedriver_processes()
 
102
  chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
103
  chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')
104
 
 
108
  if not chromium_path:
109
  st.error("Chromium not found. Please ensure it's installed.")
110
  return None, None
 
111
  if not os.path.exists(chromedriver_path):
112
  chromedriver_path = find_binary('chromedriver')
113
  if not chromedriver_path:
 
120
  subprocess.run(['chmod', '+x', chromedriver_path], check=True)
121
  except subprocess.CalledProcessError:
122
  st.warning(f"Could not set executable permissions on {chromedriver_path}")
 
123
  if not os.access(chromium_path, os.X_OK):
124
  try:
125
  subprocess.run(['chmod', '+x', chromium_path], check=True)
 
137
  options.add_argument('--disable-extensions')
138
  options.add_argument('--disable-background-networking')
139
  options.add_argument('--window-size=1920,1080')
 
140
  options.add_argument('--ignore-certificate-errors')
 
 
 
 
 
 
141
  options.binary_location = chromium_path
142
 
143
+ # Create unique user data directory with UUID
144
+ temp_dir = os.path.join(tempfile.gettempdir(), f"chrome-data-{uuid.uuid4()}")
145
+ os.makedirs(temp_dir, exist_ok=True)
146
+ options.add_argument(f"--user-data-dir={temp_dir}")
147
+
148
+ # Initialize ChromeDriver service with random port
149
+ service = Service(executable_path=chromedriver_path, port=0)
150
  max_attempts = 3
151
  for attempt in range(max_attempts):
152
  try:
 
 
 
 
 
153
  service.start()
154
  driver = webdriver.Chrome(service=service, options=options)
155
  driver.set_page_load_timeout(60)
 
 
156
  driver.temp_dir = temp_dir
157
+ logging.info(f"ChromeDriver initialized with temp dir: {temp_dir}")
158
  return driver, service
 
159
  except WebDriverException as e:
160
  logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
161
  if attempt < max_attempts - 1:
 
165
  pass
166
  time.sleep(2)
167
  else:
 
168
  try:
169
+ shutil.rmtree(temp_dir, ignore_errors=True)
170
  except:
171
  pass
172
  logging.error(f"Failed to initialize WebDriver after {max_attempts} attempts: {str(e)}")
173
+ st.error(f"Failed to initialize WebDriver: {str(e)}")
174
  return None, None
175
  return None, None
176
 
 
186
  if not driver or not service:
187
  st.error("Failed to initialize WebDriver. Please check if Chromium and ChromeDriver are properly installed.")
188
  return None
189
+ temp_dir = getattr(driver, 'temp_dir', None)
190
  try:
191
  logging.info(f"Attempting to scrape {url}")
192
  driver.get(url)
193
  WebDriverWait(driver, 30).until(
194
  EC.presence_of_element_located((By.TAG_NAME, "body"))
195
  )
 
 
196
  title = driver.title
 
 
197
  content_selectors = [
198
  (By.ID, "content"),
199
  (By.CLASS_NAME, "mw-parser-output"),
 
202
  (By.ID, "main"),
203
  (By.TAG_NAME, "article")
204
  ]
 
205
  main_content = None
206
  for by, value in content_selectors:
207
  try:
 
209
  break
210
  except:
211
  continue
 
212
  if not main_content:
213
  main_content = driver.find_element(By.TAG_NAME, "body")
 
214
  text_content = main_content.text
215
  cleaned_content = clean_text(text_content)
216
  logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
 
217
  return {
218
  "title": title,
219
  "content": cleaned_content,
220
  "url": url
221
  }
 
222
  except (WebDriverException, TimeoutException) as e:
223
  logging.error(f"Error scraping {url}: {str(e)}")
224
  st.error(f"Error scraping {url}: {str(e)}")
225
  return None
 
226
  finally:
227
  try:
228
  driver.quit()
229
  service.stop()
230
+ if temp_dir:
231
+ shutil.rmtree(temp_dir, ignore_errors=True)
232
+ logging.info(f"Cleaned up temp directory: {temp_dir}")
 
 
 
 
 
 
 
233
  except Exception as e:
234
+ logging.warning(f"Error cleaning up WebDriver: {str(e)}")
235
 
236
  @st.cache_resource
237
  def initialize_qa_model():
 
248
  tokenizer=tokenizer,
249
  max_length=200
250
  )
251
+ return st.session_state.qa_pipeline
252
  except Exception as e:
253
  st.error(f"Failed to load QA model: {str(e)}")
254
  logging.error(f"Error loading QA model: {str(e)}")
 
352
  elif app_mode == "About":
353
  st.header("ℹ️ About")
354
  st.markdown("""
355
+ This app uses Selenium for web scraping, LangChain for vector storage with
356
+ FAISS, and Hugging Face models for embeddings and question answering.
357
  - **Web Scraping**: Extracts text using headless Chromium.
358
  - **RAG**: Indexes content with sentence-transformers and answers questions using FLAN-T5.
359
  - **Tech Stack**: Python, Streamlit, Selenium, LangChain, Hugging Face Transformers, FAISS.