muddasser committed on
Commit
afe3575
·
verified ·
1 Parent(s): b310e16

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -67
app.py CHANGED
@@ -5,8 +5,6 @@ from selenium.webdriver.chrome.options import Options
5
  from selenium.webdriver.common.by import By
6
  from selenium.webdriver.support.ui import WebDriverWait
7
  from selenium.webdriver.support import expected_conditions as EC
8
- from webdriver_manager.chrome import ChromeDriverManager
9
- from webdriver_manager.core.os_manager import ChromeType
10
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
11
  from sentence_transformers import SentenceTransformer
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -77,12 +75,10 @@ def find_binary(binary_name):
77
  """Find binary path."""
78
  try:
79
  result = subprocess.check_output(
80
- ['find', '/', '-name', binary_name, '-type', 'f', '-executable'],
81
  stderr=subprocess.DEVNULL
82
  ).decode().strip()
83
- paths = result.split('\n') if result else []
84
- logging.info(f"Found {binary_name} at: {paths}")
85
- return paths[0] if paths else None
86
  except Exception as e:
87
  logging.error(f"Error finding {binary_name}: {str(e)}")
88
  return None
@@ -107,14 +103,28 @@ def setup_driver():
107
  """Set up Selenium WebDriver."""
108
  cleanup_chromedriver_processes() # Clean up before starting
109
  try:
 
 
 
110
  chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
111
  chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')
112
 
113
- # Verify binaries
114
  if not os.path.exists(chromium_path):
115
  chromium_path = find_binary('chromium') or find_binary('chromium-browser')
 
 
 
 
116
  if not os.path.exists(chromedriver_path):
117
  chromedriver_path = find_binary('chromedriver')
 
 
 
 
 
 
 
118
 
119
  check_versions(chromium_path, chromedriver_path)
120
 
@@ -127,21 +137,14 @@ def setup_driver():
127
  options.add_argument('--disable-extensions')
128
  options.add_argument('--disable-background-networking')
129
  options.add_argument('--window-size=1920,1080')
130
- options.add_argument('--remote-debugging-port=0') # Random port to avoid conflicts
131
  options.add_argument('--ignore-certificate-errors')
132
  options.add_argument('--disable-web-security')
133
- options.binary_location = chromium_path if chromium_path else '/usr/bin/chromium'
134
-
135
- # Initialize ChromeDriver
136
- try:
137
- subprocess.run(['chmod', '+x', chromedriver_path], check=True)
138
- service = Service(chromedriver_path)
139
- except:
140
- logging.info("Falling back to webdriver-manager")
141
- chromedriver_path = ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install()
142
- subprocess.run(['chmod', '+x', chromedriver_path], check=True)
143
- service = Service(chromedriver_path)
144
 
 
 
 
145
  max_attempts = 3
146
  for attempt in range(max_attempts):
147
  try:
@@ -149,11 +152,13 @@ def setup_driver():
149
  logging.warning(f"Port {service.port} in use, stopping service")
150
  service.stop()
151
  time.sleep(1)
 
152
  service.start()
153
  driver = webdriver.Chrome(service=service, options=options)
154
  driver.set_page_load_timeout(60)
155
  logging.info(f"ChromeDriver initialized on port {service.port}")
156
  return driver, service
 
157
  except WebDriverException as e:
158
  logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
159
  if attempt < max_attempts - 1:
@@ -164,6 +169,7 @@ def setup_driver():
164
  time.sleep(2)
165
  else:
166
  raise
 
167
  except Exception as e:
168
  logging.error(f"Error setting up WebDriver: {str(e)}")
169
  logging.error(traceback.format_exc())
@@ -180,57 +186,62 @@ def scrape_website(url):
180
  """Scrape data from the given URL."""
181
  driver, service = setup_driver()
182
  if not driver or not service:
 
183
  return None
184
- max_attempts = 3
185
- for attempt in range(max_attempts):
186
- try:
187
- logging.info(f"Attempting to scrape {url} (attempt {attempt + 1}/{max_attempts})")
188
- driver.get(url)
189
- WebDriverWait(driver, 30).until(
190
- EC.presence_of_element_located((By.TAG_NAME, "body"))
191
- )
192
- title = driver.title
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  try:
194
- main_content = WebDriverWait(driver, 10).until(
195
- EC.presence_of_element_located((By.ID, "content"))
196
- )
197
  except:
198
- try:
199
- main_content = driver.find_element(By.CLASS_NAME, "mw-parser-output")
200
- except:
201
- main_content = driver.find_element(By.TAG_NAME, "body")
202
- text_content = main_content.text
203
- cleaned_content = clean_text(text_content)
204
- logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
205
- return {
206
- "title": title,
207
- "content": cleaned_content,
208
- "url": url
209
- }
210
- except (WebDriverException, TimeoutException) as e:
211
- logging.warning(f"Scraping failed for {url} (attempt {attempt + 1}/{max_attempts}): {str(e)}")
212
- if attempt < max_attempts - 1:
213
- try:
214
- driver.quit()
215
- service.stop()
216
- except:
217
- pass
218
- driver, service = setup_driver()
219
- if not driver:
220
- return None
221
- time.sleep(2)
222
- else:
223
- logging.error(f"Error scraping {url}: {str(e)}")
224
- logging.error(traceback.format_exc())
225
- st.error(f"Error scraping {url}: {str(e)}")
226
- return None
227
- finally:
228
- try:
229
- driver.quit()
230
- service.stop()
231
- logging.info("WebDriver and service stopped")
232
- except Exception as e:
233
- logging.warning(f"Error quitting driver: {str(e)}")
234
 
235
  @st.cache_resource
236
  def initialize_qa_model():
 
5
  from selenium.webdriver.common.by import By
6
  from selenium.webdriver.support.ui import WebDriverWait
7
  from selenium.webdriver.support import expected_conditions as EC
 
 
8
  from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
9
  from sentence_transformers import SentenceTransformer
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
75
  """Find binary path."""
76
  try:
77
  result = subprocess.check_output(
78
+ ['which', binary_name],
79
  stderr=subprocess.DEVNULL
80
  ).decode().strip()
81
+ return result if result else None
 
 
82
  except Exception as e:
83
  logging.error(f"Error finding {binary_name}: {str(e)}")
84
  return None
 
103
  """Set up Selenium WebDriver."""
104
  cleanup_chromedriver_processes() # Clean up before starting
105
  try:
106
+ # Set custom cache directory to avoid permission issues
107
+ os.environ['WDM_CACHE_DIR'] = '/tmp/.wdm'
108
+
109
  chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
110
  chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')
111
 
112
+ # Verify binaries exist
113
  if not os.path.exists(chromium_path):
114
  chromium_path = find_binary('chromium') or find_binary('chromium-browser')
115
+ if not chromium_path:
116
+ st.error("Chromium not found. Please ensure it's installed.")
117
+ return None, None
118
+
119
  if not os.path.exists(chromedriver_path):
120
  chromedriver_path = find_binary('chromedriver')
121
+ if not chromedriver_path:
122
+ st.error("ChromeDriver not found. Please ensure it's installed.")
123
+ return None, None
124
+
125
+ # Ensure executables have proper permissions
126
+ subprocess.run(['chmod', '+x', chromedriver_path], check=True)
127
+ subprocess.run(['chmod', '+x', chromium_path], check=True)
128
 
129
  check_versions(chromium_path, chromedriver_path)
130
 
 
137
  options.add_argument('--disable-extensions')
138
  options.add_argument('--disable-background-networking')
139
  options.add_argument('--window-size=1920,1080')
140
+ options.add_argument('--remote-debugging-port=0')
141
  options.add_argument('--ignore-certificate-errors')
142
  options.add_argument('--disable-web-security')
143
+ options.binary_location = chromium_path
 
 
 
 
 
 
 
 
 
 
144
 
145
+ # Initialize ChromeDriver service
146
+ service = Service(executable_path=chromedriver_path)
147
+
148
  max_attempts = 3
149
  for attempt in range(max_attempts):
150
  try:
 
152
  logging.warning(f"Port {service.port} in use, stopping service")
153
  service.stop()
154
  time.sleep(1)
155
+
156
  service.start()
157
  driver = webdriver.Chrome(service=service, options=options)
158
  driver.set_page_load_timeout(60)
159
  logging.info(f"ChromeDriver initialized on port {service.port}")
160
  return driver, service
161
+
162
  except WebDriverException as e:
163
  logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
164
  if attempt < max_attempts - 1:
 
169
  time.sleep(2)
170
  else:
171
  raise
172
+
173
  except Exception as e:
174
  logging.error(f"Error setting up WebDriver: {str(e)}")
175
  logging.error(traceback.format_exc())
 
186
  """Scrape data from the given URL."""
187
  driver, service = setup_driver()
188
  if not driver or not service:
189
+ st.error("Failed to initialize WebDriver. Please check if Chromium and ChromeDriver are properly installed.")
190
  return None
191
+
192
+ try:
193
+ logging.info(f"Attempting to scrape {url}")
194
+ driver.get(url)
195
+ WebDriverWait(driver, 30).until(
196
+ EC.presence_of_element_located((By.TAG_NAME, "body"))
197
+ )
198
+
199
+ # Get page title
200
+ title = driver.title
201
+
202
+ # Try multiple selectors for main content
203
+ content_selectors = [
204
+ (By.ID, "content"),
205
+ (By.CLASS_NAME, "mw-parser-output"),
206
+ (By.TAG_NAME, "main"),
207
+ (By.CLASS_NAME, "main-content"),
208
+ (By.ID, "main"),
209
+ (By.TAG_NAME, "article")
210
+ ]
211
+
212
+ main_content = None
213
+ for by, value in content_selectors:
214
  try:
215
+ main_content = driver.find_element(by, value)
216
+ break
 
217
  except:
218
+ continue
219
+
220
+ if not main_content:
221
+ main_content = driver.find_element(By.TAG_NAME, "body")
222
+
223
+ text_content = main_content.text
224
+ cleaned_content = clean_text(text_content)
225
+ logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
226
+
227
+ return {
228
+ "title": title,
229
+ "content": cleaned_content,
230
+ "url": url
231
+ }
232
+
233
+ except (WebDriverException, TimeoutException) as e:
234
+ logging.error(f"Error scraping {url}: {str(e)}")
235
+ st.error(f"Error scraping {url}: {str(e)}")
236
+ return None
237
+
238
+ finally:
239
+ try:
240
+ driver.quit()
241
+ service.stop()
242
+ logging.info("WebDriver and service stopped")
243
+ except Exception as e:
244
+ logging.warning(f"Error quitting driver: {str(e)}")
 
 
 
 
 
 
 
 
 
245
 
246
  @st.cache_resource
247
  def initialize_qa_model():