Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,25 +1,25 @@
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from selenium import webdriver
|
| 3 |
from selenium.webdriver.chrome.service import Service
|
| 4 |
from selenium.webdriver.chrome.options import Options
|
| 5 |
from selenium.webdriver.common.by import By
|
| 6 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 7 |
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
| 8 |
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
| 9 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 10 |
from langchain_community.vectorstores import FAISS
|
| 11 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 12 |
from langchain.schema import Document
|
| 13 |
-
import logging
|
| 14 |
-
import subprocess
|
| 15 |
-
import traceback
|
| 16 |
-
import os
|
| 17 |
-
import re
|
| 18 |
-
import time
|
| 19 |
-
import psutil
|
| 20 |
-
import tempfile
|
| 21 |
-
import shutil
|
| 22 |
-
from selenium.common.exceptions import WebDriverException, TimeoutException
|
| 23 |
|
| 24 |
# Set up logging
|
| 25 |
logging.basicConfig(
|
|
@@ -57,34 +57,24 @@ def cleanup_chromedriver_processes():
|
|
| 57 |
"""Kill any lingering ChromeDriver and Chrome processes."""
|
| 58 |
try:
|
| 59 |
for proc in psutil.process_iter(['name', 'cmdline']):
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
if any('type=renderer' in arg for arg in cmdline):
|
| 65 |
continue
|
| 66 |
proc.kill()
|
| 67 |
-
logging.info(f"Killed process {name} PID {proc.pid}")
|
| 68 |
-
|
| 69 |
-
|
| 70 |
except Exception as e:
|
| 71 |
logging.warning(f"Error cleaning up processes: {str(e)}")
|
| 72 |
|
| 73 |
-
def check_port(port):
|
| 74 |
-
"""Check if a port is in use."""
|
| 75 |
-
try:
|
| 76 |
-
result = subprocess.run(['netstat', '-tuln'], capture_output=True, text=True, timeout=5)
|
| 77 |
-
return f':{port}' in result.stdout
|
| 78 |
-
except Exception as e:
|
| 79 |
-
logging.warning(f"Error checking port {port}: {str(e)}")
|
| 80 |
-
return False
|
| 81 |
-
|
| 82 |
def find_binary(binary_name):
|
| 83 |
"""Find binary path."""
|
| 84 |
try:
|
| 85 |
result = subprocess.check_output(
|
| 86 |
-
['which', binary_name],
|
| 87 |
-
stderr=subprocess.DEVNULL
|
| 88 |
).decode().strip()
|
| 89 |
return result if result else None
|
| 90 |
except Exception as e:
|
|
@@ -109,7 +99,6 @@ def check_versions(chromium_path, chromedriver_path):
|
|
| 109 |
def setup_driver():
|
| 110 |
"""Set up Selenium WebDriver with unique user data directory."""
|
| 111 |
cleanup_chromedriver_processes()
|
| 112 |
-
|
| 113 |
chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
|
| 114 |
chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')
|
| 115 |
|
|
@@ -119,7 +108,6 @@ def setup_driver():
|
|
| 119 |
if not chromium_path:
|
| 120 |
st.error("Chromium not found. Please ensure it's installed.")
|
| 121 |
return None, None
|
| 122 |
-
|
| 123 |
if not os.path.exists(chromedriver_path):
|
| 124 |
chromedriver_path = find_binary('chromedriver')
|
| 125 |
if not chromedriver_path:
|
|
@@ -132,7 +120,6 @@ def setup_driver():
|
|
| 132 |
subprocess.run(['chmod', '+x', chromedriver_path], check=True)
|
| 133 |
except subprocess.CalledProcessError:
|
| 134 |
st.warning(f"Could not set executable permissions on {chromedriver_path}")
|
| 135 |
-
|
| 136 |
if not os.access(chromium_path, os.X_OK):
|
| 137 |
try:
|
| 138 |
subprocess.run(['chmod', '+x', chromium_path], check=True)
|
|
@@ -150,36 +137,25 @@ def setup_driver():
|
|
| 150 |
options.add_argument('--disable-extensions')
|
| 151 |
options.add_argument('--disable-background-networking')
|
| 152 |
options.add_argument('--window-size=1920,1080')
|
| 153 |
-
options.add_argument('--remote-debugging-port=0')
|
| 154 |
options.add_argument('--ignore-certificate-errors')
|
| 155 |
-
options.add_argument('--disable-web-security')
|
| 156 |
-
|
| 157 |
-
# Create unique user data directory
|
| 158 |
-
temp_dir = tempfile.mkdtemp()
|
| 159 |
-
options.add_argument(f"--user-data-dir={temp_dir}")
|
| 160 |
-
|
| 161 |
options.binary_location = chromium_path
|
| 162 |
|
| 163 |
-
#
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
max_attempts = 3
|
| 167 |
for attempt in range(max_attempts):
|
| 168 |
try:
|
| 169 |
-
if check_port(service.port):
|
| 170 |
-
logging.warning(f"Port {service.port} in use, stopping service")
|
| 171 |
-
service.stop()
|
| 172 |
-
time.sleep(1)
|
| 173 |
-
|
| 174 |
service.start()
|
| 175 |
driver = webdriver.Chrome(service=service, options=options)
|
| 176 |
driver.set_page_load_timeout(60)
|
| 177 |
-
|
| 178 |
-
# Store temp directory for cleanup
|
| 179 |
driver.temp_dir = temp_dir
|
| 180 |
-
logging.info(f"ChromeDriver initialized with temp dir: {temp_dir}")
|
| 181 |
return driver, service
|
| 182 |
-
|
| 183 |
except WebDriverException as e:
|
| 184 |
logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
|
| 185 |
if attempt < max_attempts - 1:
|
|
@@ -189,13 +165,12 @@ def setup_driver():
|
|
| 189 |
pass
|
| 190 |
time.sleep(2)
|
| 191 |
else:
|
| 192 |
-
# Clean up temp directory if creation failed
|
| 193 |
try:
|
| 194 |
-
shutil.rmtree(temp_dir)
|
| 195 |
except:
|
| 196 |
pass
|
| 197 |
logging.error(f"Failed to initialize WebDriver after {max_attempts} attempts: {str(e)}")
|
| 198 |
-
st.error(f"Failed to initialize WebDriver: {str(e)}")
|
| 199 |
return None, None
|
| 200 |
return None, None
|
| 201 |
|
|
@@ -211,18 +186,14 @@ def scrape_website(url):
|
|
| 211 |
if not driver or not service:
|
| 212 |
st.error("Failed to initialize WebDriver. Please check if Chromium and ChromeDriver are properly installed.")
|
| 213 |
return None
|
| 214 |
-
|
| 215 |
try:
|
| 216 |
logging.info(f"Attempting to scrape {url}")
|
| 217 |
driver.get(url)
|
| 218 |
WebDriverWait(driver, 30).until(
|
| 219 |
EC.presence_of_element_located((By.TAG_NAME, "body"))
|
| 220 |
)
|
| 221 |
-
|
| 222 |
-
# Get page title
|
| 223 |
title = driver.title
|
| 224 |
-
|
| 225 |
-
# Try multiple selectors for main content
|
| 226 |
content_selectors = [
|
| 227 |
(By.ID, "content"),
|
| 228 |
(By.CLASS_NAME, "mw-parser-output"),
|
|
@@ -231,7 +202,6 @@ def scrape_website(url):
|
|
| 231 |
(By.ID, "main"),
|
| 232 |
(By.TAG_NAME, "article")
|
| 233 |
]
|
| 234 |
-
|
| 235 |
main_content = None
|
| 236 |
for by, value in content_selectors:
|
| 237 |
try:
|
|
@@ -239,41 +209,29 @@ def scrape_website(url):
|
|
| 239 |
break
|
| 240 |
except:
|
| 241 |
continue
|
| 242 |
-
|
| 243 |
if not main_content:
|
| 244 |
main_content = driver.find_element(By.TAG_NAME, "body")
|
| 245 |
-
|
| 246 |
text_content = main_content.text
|
| 247 |
cleaned_content = clean_text(text_content)
|
| 248 |
logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
|
| 249 |
-
|
| 250 |
return {
|
| 251 |
"title": title,
|
| 252 |
"content": cleaned_content,
|
| 253 |
"url": url
|
| 254 |
}
|
| 255 |
-
|
| 256 |
except (WebDriverException, TimeoutException) as e:
|
| 257 |
logging.error(f"Error scraping {url}: {str(e)}")
|
| 258 |
st.error(f"Error scraping {url}: {str(e)}")
|
| 259 |
return None
|
| 260 |
-
|
| 261 |
finally:
|
| 262 |
try:
|
| 263 |
driver.quit()
|
| 264 |
service.stop()
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
try:
|
| 269 |
-
shutil.rmtree(driver.temp_dir)
|
| 270 |
-
logging.info(f"Cleaned up temp directory: {driver.temp_dir}")
|
| 271 |
-
except Exception as e:
|
| 272 |
-
logging.warning(f"Error cleaning up temp directory: {str(e)}")
|
| 273 |
-
|
| 274 |
-
logging.info("WebDriver and service stopped")
|
| 275 |
except Exception as e:
|
| 276 |
-
logging.warning(f"Error cleaning up WebDriver: {str(e)}")
|
| 277 |
|
| 278 |
@st.cache_resource
|
| 279 |
def initialize_qa_model():
|
|
@@ -290,7 +248,7 @@ def initialize_qa_model():
|
|
| 290 |
tokenizer=tokenizer,
|
| 291 |
max_length=200
|
| 292 |
)
|
| 293 |
-
|
| 294 |
except Exception as e:
|
| 295 |
st.error(f"Failed to load QA model: {str(e)}")
|
| 296 |
logging.error(f"Error loading QA model: {str(e)}")
|
|
@@ -394,7 +352,8 @@ elif app_mode == "Chat with Content":
|
|
| 394 |
elif app_mode == "About":
|
| 395 |
st.header("ℹ️ About")
|
| 396 |
st.markdown("""
|
| 397 |
-
This app uses Selenium for web scraping, LangChain for vector storage with
|
|
|
|
| 398 |
- **Web Scraping**: Extracts text using headless Chromium.
|
| 399 |
- **RAG**: Indexes content with sentence-transformers and answers questions using FLAN-T5.
|
| 400 |
- **Tech Stack**: Python, Streamlit, Selenium, LangChain, Hugging Face Transformers, FAISS.
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
import time
|
| 5 |
+
import uuid
|
| 6 |
+
import subprocess
|
| 7 |
+
import tempfile
|
| 8 |
+
import shutil
|
| 9 |
+
import logging
|
| 10 |
+
import psutil
|
| 11 |
from selenium import webdriver
|
| 12 |
from selenium.webdriver.chrome.service import Service
|
| 13 |
from selenium.webdriver.chrome.options import Options
|
| 14 |
from selenium.webdriver.common.by import By
|
| 15 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 16 |
from selenium.webdriver.support import expected_conditions as EC
|
| 17 |
+
from selenium.common.exceptions import WebDriverException, TimeoutException
|
| 18 |
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
| 19 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 20 |
from langchain_community.vectorstores import FAISS
|
| 21 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 22 |
from langchain.schema import Document
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
# Set up logging
|
| 25 |
logging.basicConfig(
|
|
|
|
| 57 |
"""Kill any lingering ChromeDriver and Chrome processes."""
|
| 58 |
try:
|
| 59 |
for proc in psutil.process_iter(['name', 'cmdline']):
|
| 60 |
+
try:
|
| 61 |
+
name = proc.info['name'].lower()
|
| 62 |
+
cmdline = proc.info.get('cmdline', [])
|
| 63 |
+
if 'chromedriver' in name or 'chrome' in name or 'chromium' in name:
|
| 64 |
if any('type=renderer' in arg for arg in cmdline):
|
| 65 |
continue
|
| 66 |
proc.kill()
|
| 67 |
+
logging.info(f"Killed process {name} PID {proc.pid}")
|
| 68 |
+
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
| 69 |
+
pass
|
| 70 |
except Exception as e:
|
| 71 |
logging.warning(f"Error cleaning up processes: {str(e)}")
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
def find_binary(binary_name):
|
| 74 |
"""Find binary path."""
|
| 75 |
try:
|
| 76 |
result = subprocess.check_output(
|
| 77 |
+
['which', binary_name], stderr=subprocess.DEVNULL
|
|
|
|
| 78 |
).decode().strip()
|
| 79 |
return result if result else None
|
| 80 |
except Exception as e:
|
|
|
|
| 99 |
def setup_driver():
|
| 100 |
"""Set up Selenium WebDriver with unique user data directory."""
|
| 101 |
cleanup_chromedriver_processes()
|
|
|
|
| 102 |
chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
|
| 103 |
chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')
|
| 104 |
|
|
|
|
| 108 |
if not chromium_path:
|
| 109 |
st.error("Chromium not found. Please ensure it's installed.")
|
| 110 |
return None, None
|
|
|
|
| 111 |
if not os.path.exists(chromedriver_path):
|
| 112 |
chromedriver_path = find_binary('chromedriver')
|
| 113 |
if not chromedriver_path:
|
|
|
|
| 120 |
subprocess.run(['chmod', '+x', chromedriver_path], check=True)
|
| 121 |
except subprocess.CalledProcessError:
|
| 122 |
st.warning(f"Could not set executable permissions on {chromedriver_path}")
|
|
|
|
| 123 |
if not os.access(chromium_path, os.X_OK):
|
| 124 |
try:
|
| 125 |
subprocess.run(['chmod', '+x', chromium_path], check=True)
|
|
|
|
| 137 |
options.add_argument('--disable-extensions')
|
| 138 |
options.add_argument('--disable-background-networking')
|
| 139 |
options.add_argument('--window-size=1920,1080')
|
|
|
|
| 140 |
options.add_argument('--ignore-certificate-errors')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
options.binary_location = chromium_path
|
| 142 |
|
| 143 |
+
# Create unique user data directory with UUID
|
| 144 |
+
temp_dir = os.path.join(tempfile.gettempdir(), f"chrome-data-{uuid.uuid4()}")
|
| 145 |
+
os.makedirs(temp_dir, exist_ok=True)
|
| 146 |
+
options.add_argument(f"--user-data-dir={temp_dir}")
|
| 147 |
+
|
| 148 |
+
# Initialize ChromeDriver service with random port
|
| 149 |
+
service = Service(executable_path=chromedriver_path, port=0)
|
| 150 |
max_attempts = 3
|
| 151 |
for attempt in range(max_attempts):
|
| 152 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
service.start()
|
| 154 |
driver = webdriver.Chrome(service=service, options=options)
|
| 155 |
driver.set_page_load_timeout(60)
|
|
|
|
|
|
|
| 156 |
driver.temp_dir = temp_dir
|
| 157 |
+
logging.info(f"ChromeDriver initialized with temp dir: {temp_dir}")
|
| 158 |
return driver, service
|
|
|
|
| 159 |
except WebDriverException as e:
|
| 160 |
logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
|
| 161 |
if attempt < max_attempts - 1:
|
|
|
|
| 165 |
pass
|
| 166 |
time.sleep(2)
|
| 167 |
else:
|
|
|
|
| 168 |
try:
|
| 169 |
+
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 170 |
except:
|
| 171 |
pass
|
| 172 |
logging.error(f"Failed to initialize WebDriver after {max_attempts} attempts: {str(e)}")
|
| 173 |
+
st.error(f"Failed to initialize WebDriver: {str(e)}")
|
| 174 |
return None, None
|
| 175 |
return None, None
|
| 176 |
|
|
|
|
| 186 |
if not driver or not service:
|
| 187 |
st.error("Failed to initialize WebDriver. Please check if Chromium and ChromeDriver are properly installed.")
|
| 188 |
return None
|
| 189 |
+
temp_dir = getattr(driver, 'temp_dir', None)
|
| 190 |
try:
|
| 191 |
logging.info(f"Attempting to scrape {url}")
|
| 192 |
driver.get(url)
|
| 193 |
WebDriverWait(driver, 30).until(
|
| 194 |
EC.presence_of_element_located((By.TAG_NAME, "body"))
|
| 195 |
)
|
|
|
|
|
|
|
| 196 |
title = driver.title
|
|
|
|
|
|
|
| 197 |
content_selectors = [
|
| 198 |
(By.ID, "content"),
|
| 199 |
(By.CLASS_NAME, "mw-parser-output"),
|
|
|
|
| 202 |
(By.ID, "main"),
|
| 203 |
(By.TAG_NAME, "article")
|
| 204 |
]
|
|
|
|
| 205 |
main_content = None
|
| 206 |
for by, value in content_selectors:
|
| 207 |
try:
|
|
|
|
| 209 |
break
|
| 210 |
except:
|
| 211 |
continue
|
|
|
|
| 212 |
if not main_content:
|
| 213 |
main_content = driver.find_element(By.TAG_NAME, "body")
|
|
|
|
| 214 |
text_content = main_content.text
|
| 215 |
cleaned_content = clean_text(text_content)
|
| 216 |
logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
|
|
|
|
| 217 |
return {
|
| 218 |
"title": title,
|
| 219 |
"content": cleaned_content,
|
| 220 |
"url": url
|
| 221 |
}
|
|
|
|
| 222 |
except (WebDriverException, TimeoutException) as e:
|
| 223 |
logging.error(f"Error scraping {url}: {str(e)}")
|
| 224 |
st.error(f"Error scraping {url}: {str(e)}")
|
| 225 |
return None
|
|
|
|
| 226 |
finally:
|
| 227 |
try:
|
| 228 |
driver.quit()
|
| 229 |
service.stop()
|
| 230 |
+
if temp_dir:
|
| 231 |
+
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 232 |
+
logging.info(f"Cleaned up temp directory: {temp_dir}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
except Exception as e:
|
| 234 |
+
logging.warning(f"Error cleaning up WebDriver: {str(e)}")
|
| 235 |
|
| 236 |
@st.cache_resource
|
| 237 |
def initialize_qa_model():
|
|
|
|
| 248 |
tokenizer=tokenizer,
|
| 249 |
max_length=200
|
| 250 |
)
|
| 251 |
+
return st.session_state.qa_pipeline
|
| 252 |
except Exception as e:
|
| 253 |
st.error(f"Failed to load QA model: {str(e)}")
|
| 254 |
logging.error(f"Error loading QA model: {str(e)}")
|
|
|
|
| 352 |
elif app_mode == "About":
|
| 353 |
st.header("ℹ️ About")
|
| 354 |
st.markdown("""
|
| 355 |
+
This app uses Selenium for web scraping, LangChain for vector storage with
|
| 356 |
+
FAISS, and Hugging Face models for embeddings and question answering.
|
| 357 |
- **Web Scraping**: Extracts text using headless Chromium.
|
| 358 |
- **RAG**: Indexes content with sentence-transformers and answers questions using FLAN-T5.
|
| 359 |
- **Tech Stack**: Python, Streamlit, Selenium, LangChain, Hugging Face Transformers, FAISS.
|