muddasser committed on
Commit
9d1899c
·
verified ·
1 Parent(s): 9d74f3e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -60
app.py CHANGED
@@ -19,11 +19,10 @@ import traceback
19
  import os
20
  import re
21
  import time
22
- import numpy as np
23
- from typing import List
24
  from selenium.common.exceptions import WebDriverException, TimeoutException
25
 
26
- # Set up logging to /tmp/app.log
27
  logging.basicConfig(
28
  filename='/tmp/app.log',
29
  level=logging.DEBUG,
@@ -45,7 +44,7 @@ This app combines web scraping with Retrieval-Augmented Generation (RAG) to crea
45
  It can scrape websites, index the content, and answer your questions about the scraped content.
46
  """)
47
 
48
- # Initialize session state variables
49
  if 'scraped_content' not in st.session_state:
50
  st.session_state.scraped_content = ""
51
  if 'vector_store' not in st.session_state:
@@ -55,8 +54,27 @@ if 'chat_history' not in st.session_state:
55
  if 'qa_pipeline' not in st.session_state:
56
  st.session_state.qa_pipeline = None
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  def find_binary(binary_name):
59
- """Find binary path using 'find' command."""
60
  try:
61
  result = subprocess.check_output(
62
  ['find', '/', '-name', binary_name, '-type', 'f', '-executable'],
@@ -70,64 +88,36 @@ def find_binary(binary_name):
70
  return None
71
 
72
  def check_versions(chromium_path, chromedriver_path):
73
- """Log Chromium and ChromeDriver versions for debugging."""
74
  if chromium_path:
75
  try:
76
  chromium_version = subprocess.check_output([chromium_path, '--version']).decode().strip()
77
  logging.info(f"Chromium path: {chromium_path}, version: {chromium_version}")
78
  except Exception as e:
79
  logging.error(f"Error checking Chromium version: {str(e)}")
80
- else:
81
- logging.error("Chromium binary not found")
82
  if chromedriver_path:
83
  try:
84
  chromedriver_version = subprocess.check_output([chromedriver_path, '--version']).decode().strip()
85
  logging.info(f"ChromeDriver path: {chromedriver_path}, version: {chromedriver_version}")
86
  except Exception as e:
87
  logging.error(f"Error checking ChromeDriver version: {str(e)}")
88
- else:
89
- logging.error("ChromeDriver binary not found")
90
 
91
  @st.cache_resource
92
  def setup_driver():
93
- """Set up Selenium WebDriver with headless Chromium."""
 
94
  try:
95
- # Define possible binary paths
96
- possible_chromium_paths = [
97
- '/usr/lib/chromium-browser/chromium-browser',
98
- '/usr/bin/chromium-browser',
99
- '/usr/bin/chromium',
100
- '/usr/bin/chrome'
101
- ]
102
- possible_chromedriver_paths = [
103
- '/usr/lib/chromium-browser/chromedriver',
104
- '/usr/bin/chromedriver'
105
- ]
106
-
107
- # Search for binaries
108
- chromium_path = None
109
- for path in possible_chromium_paths:
110
- if os.path.exists(path):
111
- chromium_path = path
112
- break
113
- if not chromium_path:
114
- chromium_path = find_binary('chromium') or find_binary('chromium-browser') or find_binary('chrome')
115
 
116
- chromedriver_path = None
117
- for path in possible_chromedriver_paths:
118
- if os.path.exists(path):
119
- chromedriver_path = path
120
- break
121
- if not chromedriver_path:
122
  chromedriver_path = find_binary('chromedriver')
123
 
124
- # Log versions for debugging
125
  check_versions(chromium_path, chromedriver_path)
126
 
127
- if not chromium_path:
128
- logging.warning("No Chromium binary found, using default path")
129
- chromium_path = '/usr/bin/chromium'
130
-
131
  # Configure Chrome options
132
  options = Options()
133
  options.add_argument('--headless=new')
@@ -137,36 +127,35 @@ def setup_driver():
137
  options.add_argument('--disable-extensions')
138
  options.add_argument('--disable-background-networking')
139
  options.add_argument('--window-size=1920,1080')
140
- options.add_argument('--remote-debugging-port=9222')
141
- options.add_argument('--ignore-certificate-errors') # Handle SSL issues
142
- options.add_argument('--disable-web-security') # Relax CORS for testing
143
- options.binary_location = chromium_path
144
 
145
  # Initialize ChromeDriver
146
- if chromedriver_path and os.path.exists(chromedriver_path):
147
- try:
148
- subprocess.run(['chmod', '+x', chromedriver_path], check=True)
149
- logging.info(f"Set executable permissions for {chromedriver_path}")
150
- except subprocess.CalledProcessError as e:
151
- logging.warning(f"Failed to chmod {chromedriver_path}: {str(e)}")
152
  service = Service(chromedriver_path)
153
- else:
154
- logging.info("Using webdriver-manager for ChromeDriver")
155
  chromedriver_path = ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install()
156
  subprocess.run(['chmod', '+x', chromedriver_path], check=True)
157
  service = Service(chromedriver_path)
158
 
159
- # Start WebDriver with retry
160
  max_attempts = 3
161
  for attempt in range(max_attempts):
162
  try:
 
 
 
 
163
  service.start()
164
  driver = webdriver.Chrome(service=service, options=options)
165
  driver.set_page_load_timeout(60)
166
  logging.info(f"ChromeDriver initialized on port {service.port}")
167
  return driver, service
168
  except WebDriverException as e:
169
- logging.warning(f"WebDriver initialization failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
170
  if attempt < max_attempts - 1:
171
  try:
172
  service.stop()
@@ -188,7 +177,7 @@ def clean_text(text):
188
  return text.strip()
189
 
190
  def scrape_website(url):
191
- """Scrape data from the given URL with retry logic."""
192
  driver, service = setup_driver()
193
  if not driver or not service:
194
  return None
@@ -200,9 +189,7 @@ def scrape_website(url):
200
  WebDriverWait(driver, 30).until(
201
  EC.presence_of_element_located((By.TAG_NAME, "body"))
202
  )
203
- # Get page title
204
  title = driver.title
205
- # Get main content (Wikipedia-specific)
206
  try:
207
  main_content = WebDriverWait(driver, 10).until(
208
  EC.presence_of_element_located((By.ID, "content"))
@@ -269,7 +256,7 @@ def initialize_qa_model():
269
 
270
  @st.cache_resource
271
  def create_vector_store(text):
272
- """Create a FAISS vector store from scraped text."""
273
  try:
274
  text_splitter = RecursiveCharacterTextSplitter(
275
  chunk_size=500,
 
19
  import os
20
  import re
21
  import time
22
+ import psutil
 
23
  from selenium.common.exceptions import WebDriverException, TimeoutException
24
 
25
+ # Set up logging
26
  logging.basicConfig(
27
  filename='/tmp/app.log',
28
  level=logging.DEBUG,
 
44
  It can scrape websites, index the content, and answer your questions about the scraped content.
45
  """)
46
 
47
+ # Initialize session state
48
  if 'scraped_content' not in st.session_state:
49
  st.session_state.scraped_content = ""
50
  if 'vector_store' not in st.session_state:
 
54
  if 'qa_pipeline' not in st.session_state:
55
  st.session_state.qa_pipeline = None
56
 
57
def cleanup_chromedriver_processes():
    """Kill any lingering ChromeDriver processes.

    Iterates over all processes visible to psutil and kills every one whose
    process name is exactly ``chromedriver``. This is best-effort: failures
    are logged as warnings and never raised to the caller.

    Returns:
        None.
    """
    try:
        for proc in psutil.process_iter(['name']):
            if proc.info['name'] != 'chromedriver':
                continue
            # Handle errors per process so that one process that has already
            # exited, or that we are not allowed to signal, does not stop the
            # remaining ChromeDriver processes from being cleaned up.
            try:
                proc.kill()
                logging.info(f"Killed ChromeDriver process PID {proc.pid}")
            except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
                logging.warning(
                    f"Could not kill ChromeDriver process PID {proc.pid}: {str(e)}"
                )
    except Exception as e:
        # Enumeration itself failed; keep the original best-effort contract.
        logging.warning(f"Error cleaning up ChromeDriver processes: {str(e)}")
66
+
67
def check_port(port):
    """Check if a port is in use.

    Runs ``netstat -tuln`` and looks for an address that ends in ``:<port>``.
    The port is matched as a whole token, so asking about port 80 does not
    report a hit for ``:8080``. If ``netstat`` is not installed or exits with
    an error, a TCP connect probe against 127.0.0.1 is used instead.

    Args:
        port: Port number to look for.

    Returns:
        True if the port appears to be in use, False otherwise (including
        when the check itself fails; the failure is logged as a warning).
    """
    import socket  # local import: not part of the file's top-level imports

    try:
        listing = None
        try:
            result = subprocess.run(
                ['netstat', '-tuln'], capture_output=True, text=True
            )
            if result.returncode == 0:
                listing = result.stdout
        except FileNotFoundError:
            # netstat is not available on this system; use the probe below.
            listing = None

        if listing is not None:
            # Require whitespace or end-of-text right after the port number so
            # that a longer port (or an IPv6 group) is not matched by prefix.
            return re.search(rf':{port}(?=\s|$)', listing, re.MULTILINE) is not None

        # Fallback: a successful TCP connect means something is listening.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.settimeout(1)
            return probe.connect_ex(('127.0.0.1', int(port))) == 0
    except Exception as e:
        logging.warning(f"Error checking port {port}: {str(e)}")
        return False
75
+
76
  def find_binary(binary_name):
77
+ """Find binary path."""
78
  try:
79
  result = subprocess.check_output(
80
  ['find', '/', '-name', binary_name, '-type', 'f', '-executable'],
 
88
  return None
89
 
90
  def check_versions(chromium_path, chromedriver_path):
91
+ """Log Chromium and ChromeDriver versions."""
92
  if chromium_path:
93
  try:
94
  chromium_version = subprocess.check_output([chromium_path, '--version']).decode().strip()
95
  logging.info(f"Chromium path: {chromium_path}, version: {chromium_version}")
96
  except Exception as e:
97
  logging.error(f"Error checking Chromium version: {str(e)}")
 
 
98
  if chromedriver_path:
99
  try:
100
  chromedriver_version = subprocess.check_output([chromedriver_path, '--version']).decode().strip()
101
  logging.info(f"ChromeDriver path: {chromedriver_path}, version: {chromedriver_version}")
102
  except Exception as e:
103
  logging.error(f"Error checking ChromeDriver version: {str(e)}")
 
 
104
 
105
  @st.cache_resource
106
  def setup_driver():
107
+ """Set up Selenium WebDriver."""
108
+ cleanup_chromedriver_processes() # Clean up before starting
109
  try:
110
+ chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
111
+ chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
+ # Verify binaries
114
+ if not os.path.exists(chromium_path):
115
+ chromium_path = find_binary('chromium') or find_binary('chromium-browser')
116
+ if not os.path.exists(chromedriver_path):
 
 
117
  chromedriver_path = find_binary('chromedriver')
118
 
 
119
  check_versions(chromium_path, chromedriver_path)
120
 
 
 
 
 
121
  # Configure Chrome options
122
  options = Options()
123
  options.add_argument('--headless=new')
 
127
  options.add_argument('--disable-extensions')
128
  options.add_argument('--disable-background-networking')
129
  options.add_argument('--window-size=1920,1080')
130
+ options.add_argument('--remote-debugging-port=0') # Random port to avoid conflicts
131
+ options.add_argument('--ignore-certificate-errors')
132
+ options.add_argument('--disable-web-security')
133
+ options.binary_location = chromium_path if chromium_path else '/usr/bin/chromium'
134
 
135
  # Initialize ChromeDriver
136
+ try:
137
+ subprocess.run(['chmod', '+x', chromedriver_path], check=True)
 
 
 
 
138
  service = Service(chromedriver_path)
139
+ except:
140
+ logging.info("Falling back to webdriver-manager")
141
  chromedriver_path = ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install()
142
  subprocess.run(['chmod', '+x', chromedriver_path], check=True)
143
  service = Service(chromedriver_path)
144
 
 
145
  max_attempts = 3
146
  for attempt in range(max_attempts):
147
  try:
148
+ if check_port(service.port):
149
+ logging.warning(f"Port {service.port} in use, stopping service")
150
+ service.stop()
151
+ time.sleep(1)
152
  service.start()
153
  driver = webdriver.Chrome(service=service, options=options)
154
  driver.set_page_load_timeout(60)
155
  logging.info(f"ChromeDriver initialized on port {service.port}")
156
  return driver, service
157
  except WebDriverException as e:
158
+ logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
159
  if attempt < max_attempts - 1:
160
  try:
161
  service.stop()
 
177
  return text.strip()
178
 
179
  def scrape_website(url):
180
+ """Scrape data from the given URL."""
181
  driver, service = setup_driver()
182
  if not driver or not service:
183
  return None
 
189
  WebDriverWait(driver, 30).until(
190
  EC.presence_of_element_located((By.TAG_NAME, "body"))
191
  )
 
192
  title = driver.title
 
193
  try:
194
  main_content = WebDriverWait(driver, 10).until(
195
  EC.presence_of_element_located((By.ID, "content"))
 
256
 
257
  @st.cache_resource
258
  def create_vector_store(text):
259
+ """Create a FAISS vector store."""
260
  try:
261
  text_splitter = RecursiveCharacterTextSplitter(
262
  chunk_size=500,