muddasser commited on
Commit
cdaa7f5
·
verified ·
1 Parent(s): c3282f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -228
app.py CHANGED
@@ -1,23 +1,12 @@
1
  import streamlit as st
2
  import os
3
  import re
4
- import time
5
- import subprocess
6
- import shutil
7
  import logging
8
- import psutil
9
- from selenium import webdriver
10
- from selenium.webdriver.chrome.service import Service
11
- from selenium.webdriver.chrome.options import Options
12
- from selenium.webdriver.common.by import By
13
- from selenium.webdriver.support.ui import WebDriverWait
14
- from selenium.webdriver.support import expected_conditions as EC
15
- from selenium.common.exceptions import WebDriverException, TimeoutException
16
  from langchain.text_splitter import RecursiveCharacterTextSplitter
17
  from langchain_community.vectorstores import FAISS
18
  from langchain_community.embeddings import HuggingFaceEmbeddings
19
  from langchain.schema import Document
20
- import chromedriver_autoinstaller
21
 
22
  # Try importing transformers with fallback
23
  try:
@@ -38,21 +27,11 @@ except ImportError as e:
38
 
39
  # Set up logging
40
  logging.basicConfig(
41
- filename='/tmp/app.log',
42
  level=logging.DEBUG,
43
  format='%(asctime)s - %(levelname)s - %(message)s'
44
  )
45
 
46
- # Health check
47
- logging.info("Starting application health check")
48
- try:
49
- logging.info(f"Python version: {subprocess.check_output(['python', '--version']).decode().strip()}")
50
- logging.info(f"Pip list: {subprocess.check_output(['pip', 'list']).decode()}")
51
- logging.info(f"Environment variables: {os.environ}")
52
- logging.info(f"/tmp contents: {subprocess.check_output(['ls', '-la', '/tmp']).decode()}")
53
- except Exception as e:
54
- logging.error(f"Health check failed: {str(e)}")
55
-
56
  # Set page configuration
57
  st.set_page_config(
58
  page_title="Web Scraping + RAG Chatbot",
@@ -65,7 +44,7 @@ st.set_page_config(
65
  st.title("🕷️ Web Scraping + RAG Chatbot")
66
  st.markdown("""
67
  This app combines web scraping with Retrieval-Augmented Generation (RAG) to create an intelligent chatbot.
68
- It can scrape websites, index the content, and answer your questions about the scraped content.
69
  """)
70
 
71
  # Initialize session state
@@ -78,156 +57,6 @@ if 'chat_history' not in st.session_state:
78
  if 'qa_pipeline' not in st.session_state:
79
  st.session_state.qa_pipeline = None
80
 
81
- def cleanup_chromedriver_processes():
82
- """Kill any lingering ChromeDriver and Chrome processes."""
83
- try:
84
- active_processes = []
85
- for proc in psutil.process_iter(['name', 'cmdline', 'pid']):
86
- try:
87
- name = proc.info['name'].lower()
88
- cmdline = proc.info.get('cmdline', [])
89
- if 'chromedriver' in name or 'chrome' in name or 'chromium' in name:
90
- if any('type=renderer' in arg for arg in cmdline):
91
- continue
92
- active_processes.append(f"{name} (PID {proc.pid})")
93
- proc.terminate()
94
- try:
95
- proc.wait(timeout=5)
96
- logging.info(f"Terminated process {name} PID {proc.pid}")
97
- except psutil.TimeoutExpired:
98
- proc.kill()
99
- logging.info(f"Force killed process {name} PID {proc.pid}")
100
- except (psutil.NoSuchProcess, psutil.AccessDenied):
101
- pass
102
- if active_processes:
103
- logging.info(f"Terminated processes: {', '.join(active_processes)}")
104
- else:
105
- logging.info("No Chrome-related processes found")
106
- # Additional cleanup with killall
107
- try:
108
- subprocess.run(['killall', '-9', 'chromedriver', 'chromium', 'chrome'], check=False, capture_output=True)
109
- logging.info("Ran killall for chromedriver, chromium, and chrome")
110
- except Exception as e:
111
- logging.warning(f"Error running killall: {str(e)}")
112
- except Exception as e:
113
- logging.warning(f"Error cleaning up processes: {str(e)}")
114
-
115
- def find_binary(binary_name):
116
- """Find binary path."""
117
- try:
118
- result = subprocess.check_output(['which', binary_name], stderr=subprocess.DEVNULL).decode().strip()
119
- logging.info(f"Found {binary_name} at: {result}")
120
- return result if result else None
121
- except Exception as e:
122
- logging.error(f"Error finding {binary_name}: {str(e)}")
123
- return None
124
-
125
- def check_versions(chromium_path, chromedriver_path):
126
- """Log Chromium and ChromeDriver versions."""
127
- if chromedriver_path:
128
- try:
129
- chromedriver_version = subprocess.check_output([chromedriver_path, '--version']).decode().strip()
130
- logging.info(f"ChromeDriver path: {chromedriver_path}, version: {chromedriver_version}")
131
- except Exception as e:
132
- logging.error(f"Error checking ChromeDriver version: {str(e)}")
133
- if chromium_path:
134
- try:
135
- chromium_version = subprocess.check_output([chromium_path, '--version']).decode().strip()
136
- logging.info(f"Chromium path: {chromium_path}, version: {chromium_version}")
137
- except Exception as e:
138
- logging.error(f"Error checking Chromium version: {str(e)}")
139
-
140
- def check_disk_space():
141
- """Log available disk space for /tmp."""
142
- try:
143
- stat = shutil.disk_usage('/tmp')
144
- logging.info(f"Disk space for /tmp: Total={stat.total / (1024**3):.2f}GB, Used={stat.used / (1024**3):.2f}GB, Free={stat.free / (1024**3):.2f}GB")
145
- except Exception as e:
146
- logging.warning(f"Error checking disk space: {str(e)}")
147
-
148
- def setup_driver():
149
- """Set up Selenium WebDriver."""
150
- logging.info("Setting up Selenium WebDriver")
151
- cleanup_chromedriver_processes()
152
- check_disk_space()
153
- chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
154
-
155
- # Auto-install ChromeDriver
156
- chromedriver_path = None
157
- try:
158
- chromedriver_path = chromedriver_autoinstaller.install()
159
- if chromedriver_path:
160
- logging.info(f"Auto-installed ChromeDriver at: {chromedriver_path}")
161
- else:
162
- logging.warning("chromedriver_autoinstaller.install() returned None")
163
- except Exception as e:
164
- logging.error(f"Error auto-installing ChromeDriver: {str(e)}")
165
-
166
- # Fallback to environment variable or find_binary
167
- if not chromedriver_path:
168
- chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')
169
- logging.info(f"Falling back to CHROMEDRIVER_PATH: {chromedriver_path}")
170
- if not chromedriver_path or not os.path.exists(chromedriver_path):
171
- chromedriver_path = find_binary('chromedriver')
172
- if not chromedriver_path:
173
- st.error("ChromeDriver not found. Please ensure it's installed.")
174
- logging.error("No ChromeDriver found after fallback attempts")
175
- return None, None
176
-
177
- # Verify Chromium exists
178
- if not os.path.exists(chromium_path):
179
- chromium_path = find_binary('chromium') or find_binary('chromium-browser')
180
- if not chromium_path:
181
- st.error("Chromium not found. Please ensure it's installed.")
182
- logging.error("No Chromium found")
183
- return None, None
184
-
185
- # Check executable permissions
186
- if not os.access(chromedriver_path, os.X_OK):
187
- try:
188
- subprocess.run(['chmod', '+x', chromedriver_path], check=True)
189
- logging.info(f"Set executable permissions on {chromedriver_path}")
190
- except subprocess.CalledProcessError:
191
- st.warning(f"Could not set executable permissions on {chromedriver_path}")
192
- if not os.access(chromium_path, os.X_OK):
193
- try:
194
- subprocess.run(['chmod', '+x', chromium_path], check=True)
195
- logging.info(f"Set executable permissions on {chromium_path}")
196
- except subprocess.CalledProcessError:
197
- st.warning(f"Could not set executable permissions on {chromium_path}")
198
-
199
- check_versions(chromium_path, chromedriver_path)
200
-
201
- # Configure Chrome options
202
- options = Options()
203
- options.add_argument('--headless=new')
204
- options.add_argument('--no-sandbox')
205
- options.add_argument('--disable-dev-shm-usage')
206
- options.add_argument('--disable-gpu')
207
- options.add_argument('--disable-extensions')
208
- options.add_argument('--disable-background-networking')
209
- options.add_argument('--window-size=1920,1080')
210
- options.add_argument('--ignore-certificate-errors')
211
- options.binary_location = chromium_path
212
-
213
- # Initialize ChromeDriver
214
- try:
215
- service = Service(executable_path=chromedriver_path, port=0)
216
- logging.info("Starting ChromeDriver service")
217
- service.start()
218
- driver = webdriver.Chrome(service=service, options=options)
219
- driver.set_page_load_timeout(60)
220
- logging.info("ChromeDriver initialized successfully")
221
- return driver, service
222
- except WebDriverException as e:
223
- logging.error(f"Failed to initialize WebDriver: {str(e)}")
224
- st.error(f"Failed to initialize WebDriver: {str(e)}")
225
- return None, None
226
- except Exception as e:
227
- logging.error(f"Unexpected error initializing ChromeDriver: {str(e)}")
228
- st.error(f"Unexpected error initializing ChromeDriver: {str(e)}")
229
- return None, None
230
-
231
  def clean_text(text):
232
  """Clean and normalize scraped text."""
233
  try:
@@ -239,59 +68,49 @@ def clean_text(text):
239
  return text
240
 
241
  def scrape_website(url):
242
- """Scrape data from the given URL."""
243
  logging.info(f"Starting scrape for URL: {url}")
244
- driver, service = setup_driver()
245
- if not driver or not service:
246
- st.error("Failed to initialize WebDriver. Please check if Chromium and ChromeDriver are installed.")
247
- logging.error("WebDriver initialization failed")
248
- return None
249
- try:
250
- logging.info(f"Navigating to {url}")
251
- driver.get(url)
252
- WebDriverWait(driver, 30).until(
253
- EC.presence_of_element_located((By.TAG_NAME, "body"))
254
- )
255
- title = driver.title
256
- content_selectors = [
257
- (By.ID, "content"),
258
- (By.CLASS_NAME, "mw-parser-output"),
259
- (By.TAG_NAME, "main"),
260
- (By.CLASS_NAME, "main-content"),
261
- (By.ID, "main"),
262
- (By.TAG_NAME, "article")
263
- ]
264
- main_content = None
265
- for by, value in content_selectors:
266
- try:
267
- main_content = driver.find_element(by, value)
268
- logging.info(f"Found content with selector: {by}={value}")
269
- break
270
- except:
271
- continue
272
- if not main_content:
273
- main_content = driver.find_element(By.TAG_NAME, "body")
274
- logging.info("Falling back to body tag for content")
275
- text_content = main_content.text
276
- cleaned_content = clean_text(text_content)
277
- logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
278
- return {
279
- "title": title,
280
- "content": cleaned_content,
281
- "url": url
282
- }
283
- except (WebDriverException, TimeoutException) as e:
284
- logging.error(f"Error scraping {url}: {str(e)}")
285
- st.error(f"Error scraping {url}: {str(e)}")
286
- return None
287
- finally:
288
  try:
289
- driver.quit()
290
- service.stop()
291
- logging.info("WebDriver and service stopped")
292
- cleanup_chromedriver_processes()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  except Exception as e:
294
- logging.warning(f"Error cleaning up WebDriver: {str(e)}")
 
 
 
 
295
 
296
  @st.cache_resource
297
  def initialize_qa_model():
@@ -402,7 +221,7 @@ if app_mode == "Web Scraping":
402
  with st.expander("View scraped content"):
403
  st.text_area("Content", result['content'], height=300)
404
  else:
405
- st.error("Failed to scrape the website. Check logs for details.")
406
  else:
407
  st.warning("Please enter a valid URL (e.g., https://example.com).")
408
 
@@ -427,10 +246,10 @@ elif app_mode == "Chat with Content":
427
  elif app_mode == "About":
428
  st.header("ℹ️ About")
429
  st.markdown("""
430
- This app uses Selenium for web scraping, LangChain for vector storage with FAISS,
431
  and Hugging Face models for embeddings and question answering.
432
- - **Web Scraping**: Extracts text using headless Chromium.
433
  - **RAG**: Indexes content with sentence-transformers and answers questions using FLAN-T5.
434
- - **Tech Stack**: Python, Streamlit, Selenium, LangChain, Hugging Face Transformers, FAISS.
435
  - **Docker**: Runs in a containerized environment.
436
  """)
 
1
  import streamlit as st
2
  import os
3
  import re
 
 
 
4
  import logging
5
+ from playwright.sync_api import sync_playwright
 
 
 
 
 
 
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_community.vectorstores import FAISS
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
9
  from langchain.schema import Document
 
10
 
11
  # Try importing transformers with fallback
12
  try:
 
27
 
28
  # Set up logging
29
  logging.basicConfig(
30
+ filename='/app/cache/app.log',
31
  level=logging.DEBUG,
32
  format='%(asctime)s - %(levelname)s - %(message)s'
33
  )
34
 
 
 
 
 
 
 
 
 
 
 
35
  # Set page configuration
36
  st.set_page_config(
37
  page_title="Web Scraping + RAG Chatbot",
 
44
  st.title("🕷️ Web Scraping + RAG Chatbot")
45
  st.markdown("""
46
  This app combines web scraping with Retrieval-Augmented Generation (RAG) to create an intelligent chatbot.
47
+ Enter a URL to scrape its content, then ask questions about the scraped data.
48
  """)
49
 
50
  # Initialize session state
 
57
  if 'qa_pipeline' not in st.session_state:
58
  st.session_state.qa_pipeline = None
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  def clean_text(text):
61
  """Clean and normalize scraped text."""
62
  try:
 
68
  return text
69
 
70
def scrape_website(url):
    """Scrape the main textual content of *url* using headless Playwright Chromium.

    Tries a list of common content-container selectors (Wikipedia-style and
    generic) and falls back to the whole <body> when none match.

    Args:
        url: Fully-qualified URL to fetch (e.g. "https://example.com").

    Returns:
        dict with keys "title", "content" (cleaned text), and "url" on
        success, or None when navigation/extraction fails (the error is
        logged and surfaced via st.error).
    """
    logging.info("Starting scrape for URL: %s", url)
    with sync_playwright() as p:
        # --no-sandbox / --disable-dev-shm-usage are required in most
        # containerized environments (see the Docker note in the About page).
        browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
        page = browser.new_page()
        try:
            logging.info("Navigating to %s", url)
            # domcontentloaded avoids waiting for every subresource; 30s cap.
            page.goto(url, wait_until="domcontentloaded", timeout=30000)
            title = page.title()
            # Ordered from most specific to most generic container.
            content_selectors = [
                "#content",
                ".mw-parser-output",
                "main",
                ".main-content",
                "#main",
                "article"
            ]
            main_content = None
            for selector in content_selectors:
                try:
                    main_content = page.query_selector(selector)
                    if main_content:
                        logging.info("Found content with selector: %s", selector)
                        break
                except Exception:
                    # Bad/unsupported selector for this page — try the next one.
                    # (Narrowed from a bare `except:` so Ctrl-C still works.)
                    continue
            if not main_content:
                main_content = page.query_selector("body")
                logging.info("Falling back to body tag for content")
            # Guard against a page with no queryable body (e.g. non-HTML
            # responses) instead of raising AttributeError on None.
            text_content = main_content.inner_text() if main_content else ""
            cleaned_content = clean_text(text_content)
            logging.info("Scraped %d characters from %s", len(cleaned_content), url)
            return {
                "title": title,
                "content": cleaned_content,
                "url": url
            }
        except Exception as e:
            logging.error(f"Error scraping {url}: {str(e)}")
            st.error(f"Error scraping {url}: {str(e)}")
            return None
        finally:
            # Always release the browser, even on early return/raise.
            browser.close()
114
 
115
  @st.cache_resource
116
  def initialize_qa_model():
 
221
  with st.expander("View scraped content"):
222
  st.text_area("Content", result['content'], height=300)
223
  else:
224
+ st.error("Failed to scrape the website. Check logs at /app/cache/app.log.")
225
  else:
226
  st.warning("Please enter a valid URL (e.g., https://example.com).")
227
 
 
246
  elif app_mode == "About":
247
  st.header("ℹ️ About")
248
  st.markdown("""
249
+ This app uses Playwright for web scraping, LangChain for vector storage with FAISS,
250
  and Hugging Face models for embeddings and question answering.
251
+ - **Web Scraping**: Extracts text using headless Chromium via Playwright.
252
  - **RAG**: Indexes content with sentence-transformers and answers questions using FLAN-T5.
253
+ - **Tech Stack**: Python, Streamlit, Playwright, LangChain, Hugging Face Transformers, FAISS.
254
  - **Docker**: Runs in a containerized environment.
255
  """)