Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,25 +1,25 @@
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from selenium import webdriver
|
| 3 |
from selenium.webdriver.chrome.service import Service
|
| 4 |
from selenium.webdriver.chrome.options import Options
|
| 5 |
from selenium.webdriver.common.by import By
|
| 6 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 7 |
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
| 8 |
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
| 9 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 10 |
from langchain_community.vectorstores import FAISS
|
| 11 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 12 |
from langchain.schema import Document
|
| 13 |
-
import logging
|
| 14 |
-
import subprocess
|
| 15 |
-
import traceback
|
| 16 |
-
import os
|
| 17 |
-
import re
|
| 18 |
-
import time
|
| 19 |
-
import psutil
|
| 20 |
-
import tempfile
|
| 21 |
-
import shutil
|
| 22 |
-
from selenium.common.exceptions import WebDriverException, TimeoutException
|
| 23 |
|
| 24 |
# Set up logging
|
| 25 |
logging.basicConfig(
|
|
@@ -57,34 +57,24 @@ def cleanup_chromedriver_processes():
|
|
| 57 |
"""Kill any lingering ChromeDriver and Chrome processes."""
|
| 58 |
try:
|
| 59 |
for proc in psutil.process_iter(['name', 'cmdline']):
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
if any('type=renderer' in arg for arg in cmdline):
|
| 65 |
continue
|
| 66 |
proc.kill()
|
| 67 |
-
logging.info(f"Killed process {name} PID {proc.pid}")
|
| 68 |
-
|
| 69 |
-
|
| 70 |
except Exception as e:
|
| 71 |
logging.warning(f"Error cleaning up processes: {str(e)}")
|
| 72 |
|
| 73 |
-
def check_port(port):
|
| 74 |
-
"""Check if a port is in use."""
|
| 75 |
-
try:
|
| 76 |
-
result = subprocess.run(['netstat', '-tuln'], capture_output=True, text=True, timeout=5)
|
| 77 |
-
return f':{port}' in result.stdout
|
| 78 |
-
except Exception as e:
|
| 79 |
-
logging.warning(f"Error checking port {port}: {str(e)}")
|
| 80 |
-
return False
|
| 81 |
-
|
| 82 |
def find_binary(binary_name):
|
| 83 |
"""Find binary path."""
|
| 84 |
try:
|
| 85 |
result = subprocess.check_output(
|
| 86 |
-
['which', binary_name],
|
| 87 |
-
stderr=subprocess.DEVNULL
|
| 88 |
).decode().strip()
|
| 89 |
return result if result else None
|
| 90 |
except Exception as e:
|
|
@@ -109,7 +99,6 @@ def check_versions(chromium_path, chromedriver_path):
|
|
| 109 |
def setup_driver():
|
| 110 |
"""Set up Selenium WebDriver with unique user data directory."""
|
| 111 |
cleanup_chromedriver_processes()
|
| 112 |
-
|
| 113 |
chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
|
| 114 |
chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')
|
| 115 |
|
|
@@ -119,7 +108,6 @@ def setup_driver():
|
|
| 119 |
if not chromium_path:
|
| 120 |
st.error("Chromium not found. Please ensure it's installed.")
|
| 121 |
return None, None
|
| 122 |
-
|
| 123 |
if not os.path.exists(chromedriver_path):
|
| 124 |
chromedriver_path = find_binary('chromedriver')
|
| 125 |
if not chromedriver_path:
|
|
@@ -132,7 +120,6 @@ def setup_driver():
|
|
| 132 |
subprocess.run(['chmod', '+x', chromedriver_path], check=True)
|
| 133 |
except subprocess.CalledProcessError:
|
| 134 |
st.warning(f"Could not set executable permissions on {chromedriver_path}")
|
| 135 |
-
|
| 136 |
if not os.access(chromium_path, os.X_OK):
|
| 137 |
try:
|
| 138 |
subprocess.run(['chmod', '+x', chromium_path], check=True)
|
|
@@ -150,36 +137,25 @@ def setup_driver():
|
|
| 150 |
options.add_argument('--disable-extensions')
|
| 151 |
options.add_argument('--disable-background-networking')
|
| 152 |
options.add_argument('--window-size=1920,1080')
|
| 153 |
-
options.add_argument('--remote-debugging-port=0')
|
| 154 |
options.add_argument('--ignore-certificate-errors')
|
| 155 |
-
options.add_argument('--disable-web-security')
|
| 156 |
-
|
| 157 |
-
# Create unique user data directory
|
| 158 |
-
temp_dir = tempfile.mkdtemp()
|
| 159 |
-
options.add_argument(f"--user-data-dir={temp_dir}")
|
| 160 |
-
|
| 161 |
options.binary_location = chromium_path
|
| 162 |
|
| 163 |
-
#
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
max_attempts = 3
|
| 167 |
for attempt in range(max_attempts):
|
| 168 |
try:
|
| 169 |
-
if check_port(service.port):
|
| 170 |
-
logging.warning(f"Port {service.port} in use, stopping service")
|
| 171 |
-
service.stop()
|
| 172 |
-
time.sleep(1)
|
| 173 |
-
|
| 174 |
service.start()
|
| 175 |
driver = webdriver.Chrome(service=service, options=options)
|
| 176 |
driver.set_page_load_timeout(60)
|
| 177 |
-
|
| 178 |
-
# Store temp directory for cleanup
|
| 179 |
driver.temp_dir = temp_dir
|
| 180 |
-
logging.info(f"ChromeDriver initialized with temp dir: {temp_dir}")
|
| 181 |
return driver, service
|
| 182 |
-
|
| 183 |
except WebDriverException as e:
|
| 184 |
logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
|
| 185 |
if attempt < max_attempts - 1:
|
|
@@ -189,13 +165,12 @@ def setup_driver():
|
|
| 189 |
pass
|
| 190 |
time.sleep(2)
|
| 191 |
else:
|
| 192 |
-
# Clean up temp directory if creation failed
|
| 193 |
try:
|
| 194 |
-
shutil.rmtree(temp_dir)
|
| 195 |
except:
|
| 196 |
pass
|
| 197 |
logging.error(f"Failed to initialize WebDriver after {max_attempts} attempts: {str(e)}")
|
| 198 |
-
st.error(f"Failed to initialize WebDriver: {str(e)}")
|
| 199 |
return None, None
|
| 200 |
return None, None
|
| 201 |
|
|
@@ -211,18 +186,14 @@ def scrape_website(url):
|
|
| 211 |
if not driver or not service:
|
| 212 |
st.error("Failed to initialize WebDriver. Please check if Chromium and ChromeDriver are properly installed.")
|
| 213 |
return None
|
| 214 |
-
|
| 215 |
try:
|
| 216 |
logging.info(f"Attempting to scrape {url}")
|
| 217 |
driver.get(url)
|
| 218 |
WebDriverWait(driver, 30).until(
|
| 219 |
EC.presence_of_element_located((By.TAG_NAME, "body"))
|
| 220 |
)
|
| 221 |
-
|
| 222 |
-
# Get page title
|
| 223 |
title = driver.title
|
| 224 |
-
|
| 225 |
-
# Try multiple selectors for main content
|
| 226 |
content_selectors = [
|
| 227 |
(By.ID, "content"),
|
| 228 |
(By.CLASS_NAME, "mw-parser-output"),
|
|
@@ -231,7 +202,6 @@ def scrape_website(url):
|
|
| 231 |
(By.ID, "main"),
|
| 232 |
(By.TAG_NAME, "article")
|
| 233 |
]
|
| 234 |
-
|
| 235 |
main_content = None
|
| 236 |
for by, value in content_selectors:
|
| 237 |
try:
|
|
@@ -239,41 +209,29 @@ def scrape_website(url):
|
|
| 239 |
break
|
| 240 |
except:
|
| 241 |
continue
|
| 242 |
-
|
| 243 |
if not main_content:
|
| 244 |
main_content = driver.find_element(By.TAG_NAME, "body")
|
| 245 |
-
|
| 246 |
text_content = main_content.text
|
| 247 |
cleaned_content = clean_text(text_content)
|
| 248 |
logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
|
| 249 |
-
|
| 250 |
return {
|
| 251 |
"title": title,
|
| 252 |
"content": cleaned_content,
|
| 253 |
"url": url
|
| 254 |
}
|
| 255 |
-
|
| 256 |
except (WebDriverException, TimeoutException) as e:
|
| 257 |
logging.error(f"Error scraping {url}: {str(e)}")
|
| 258 |
st.error(f"Error scraping {url}: {str(e)}")
|
| 259 |
return None
|
| 260 |
-
|
| 261 |
finally:
|
| 262 |
try:
|
| 263 |
driver.quit()
|
| 264 |
service.stop()
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
try:
|
| 269 |
-
shutil.rmtree(driver.temp_dir)
|
| 270 |
-
logging.info(f"Cleaned up temp directory: {driver.temp_dir}")
|
| 271 |
-
except Exception as e:
|
| 272 |
-
logging.warning(f"Error cleaning up temp directory: {str(e)}")
|
| 273 |
-
|
| 274 |
-
logging.info("WebDriver and service stopped")
|
| 275 |
except Exception as e:
|
| 276 |
-
logging.warning(f"Error cleaning up WebDriver: {str(e)}")
|
| 277 |
|
| 278 |
@st.cache_resource
|
| 279 |
def initialize_qa_model():
|
|
@@ -290,7 +248,7 @@ def initialize_qa_model():
|
|
| 290 |
tokenizer=tokenizer,
|
| 291 |
max_length=200
|
| 292 |
)
|
| 293 |
-
|
| 294 |
except Exception as e:
|
| 295 |
st.error(f"Failed to load QA model: {str(e)}")
|
| 296 |
logging.error(f"Error loading QA model: {str(e)}")
|
|
@@ -394,7 +352,8 @@ elif app_mode == "Chat with Content":
|
|
| 394 |
elif app_mode == "About":
|
| 395 |
st.header("ℹ️ About")
|
| 396 |
st.markdown("""
|
| 397 |
-
This app uses Selenium for web scraping, LangChain for vector storage with
|
|
|
|
| 398 |
- **Web Scraping**: Extracts text using headless Chromium.
|
| 399 |
- **RAG**: Indexes content with sentence-transformers and answers questions using FLAN-T5.
|
| 400 |
- **Tech Stack**: Python, Streamlit, Selenium, LangChain, Hugging Face Transformers, FAISS.
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
import time
|
| 5 |
+
import uuid
|
| 6 |
+
import subprocess
|
| 7 |
+
import tempfile
|
| 8 |
+
import shutil
|
| 9 |
+
import logging
|
| 10 |
+
import psutil
|
| 11 |
from selenium import webdriver
|
| 12 |
from selenium.webdriver.chrome.service import Service
|
| 13 |
from selenium.webdriver.chrome.options import Options
|
| 14 |
from selenium.webdriver.common.by import By
|
| 15 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 16 |
from selenium.webdriver.support import expected_conditions as EC
|
| 17 |
+
from selenium.common.exceptions import WebDriverException, TimeoutException
|
| 18 |
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
| 19 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 20 |
from langchain_community.vectorstores import FAISS
|
| 21 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 22 |
from langchain.schema import Document
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
# Set up logging
|
| 25 |
logging.basicConfig(
|
|
|
|
| 57 |
"""Kill any lingering ChromeDriver and Chrome processes."""
|
| 58 |
try:
|
| 59 |
for proc in psutil.process_iter(['name', 'cmdline']):
|
| 60 |
+
try:
|
| 61 |
+
name = proc.info['name'].lower()
|
| 62 |
+
cmdline = proc.info.get('cmdline', [])
|
| 63 |
+
if 'chromedriver' in name or 'chrome' in name or 'chromium' in name:
|
| 64 |
if any('type=renderer' in arg for arg in cmdline):
|
| 65 |
continue
|
| 66 |
proc.kill()
|
| 67 |
+
logging.info(f"Killed process {name} PID {proc.pid}")
|
| 68 |
+
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
| 69 |
+
pass
|
| 70 |
except Exception as e:
|
| 71 |
logging.warning(f"Error cleaning up processes: {str(e)}")
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
def find_binary(binary_name):
|
| 74 |
"""Find binary path."""
|
| 75 |
try:
|
| 76 |
result = subprocess.check_output(
|
| 77 |
+
['which', binary_name], stderr=subprocess.DEVNULL
|
|
|
|
| 78 |
).decode().strip()
|
| 79 |
return result if result else None
|
| 80 |
except Exception as e:
|
|
|
|
| 99 |
def setup_driver():
|
| 100 |
"""Set up Selenium WebDriver with unique user data directory."""
|
| 101 |
cleanup_chromedriver_processes()
|
|
|
|
| 102 |
chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
|
| 103 |
chromedriver_path = os.getenv('CHROMEDRIVER_PATH', '/usr/bin/chromedriver')
|
| 104 |
|
|
|
|
| 108 |
if not chromium_path:
|
| 109 |
st.error("Chromium not found. Please ensure it's installed.")
|
| 110 |
return None, None
|
|
|
|
| 111 |
if not os.path.exists(chromedriver_path):
|
| 112 |
chromedriver_path = find_binary('chromedriver')
|
| 113 |
if not chromedriver_path:
|
|
|
|
| 120 |
subprocess.run(['chmod', '+x', chromedriver_path], check=True)
|
| 121 |
except subprocess.CalledProcessError:
|
| 122 |
st.warning(f"Could not set executable permissions on {chromedriver_path}")
|
|
|
|
| 123 |
if not os.access(chromium_path, os.X_OK):
|
| 124 |
try:
|
| 125 |
subprocess.run(['chmod', '+x', chromium_path], check=True)
|
|
|
|
| 137 |
options.add_argument('--disable-extensions')
|
| 138 |
options.add_argument('--disable-background-networking')
|
| 139 |
options.add_argument('--window-size=1920,1080')
|
|
|
|
| 140 |
options.add_argument('--ignore-certificate-errors')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
options.binary_location = chromium_path
|
| 142 |
|
| 143 |
+
# Create unique user data directory with UUID
|
| 144 |
+
temp_dir = os.path.join(tempfile.gettempdir(), f"chrome-data-{uuid.uuid4()}")
|
| 145 |
+
os.makedirs(temp_dir, exist_ok=True)
|
| 146 |
+
options.add_argument(f"--user-data-dir={temp_dir}")
|
| 147 |
+
|
| 148 |
+
# Initialize ChromeDriver service with random port
|
| 149 |
+
service = Service(executable_path=chromedriver_path, port=0)
|
| 150 |
max_attempts = 3
|
| 151 |
for attempt in range(max_attempts):
|
| 152 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
service.start()
|
| 154 |
driver = webdriver.Chrome(service=service, options=options)
|
| 155 |
driver.set_page_load_timeout(60)
|
|
|
|
|
|
|
| 156 |
driver.temp_dir = temp_dir
|
| 157 |
+
logging.info(f"ChromeDriver initialized with temp dir: {temp_dir}")
|
| 158 |
return driver, service
|
|
|
|
| 159 |
except WebDriverException as e:
|
| 160 |
logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
|
| 161 |
if attempt < max_attempts - 1:
|
|
|
|
| 165 |
pass
|
| 166 |
time.sleep(2)
|
| 167 |
else:
|
|
|
|
| 168 |
try:
|
| 169 |
+
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 170 |
except:
|
| 171 |
pass
|
| 172 |
logging.error(f"Failed to initialize WebDriver after {max_attempts} attempts: {str(e)}")
|
| 173 |
+
st.error(f"Failed to initialize WebDriver: {str(e)}")
|
| 174 |
return None, None
|
| 175 |
return None, None
|
| 176 |
|
|
|
|
| 186 |
if not driver or not service:
|
| 187 |
st.error("Failed to initialize WebDriver. Please check if Chromium and ChromeDriver are properly installed.")
|
| 188 |
return None
|
| 189 |
+
temp_dir = getattr(driver, 'temp_dir', None)
|
| 190 |
try:
|
| 191 |
logging.info(f"Attempting to scrape {url}")
|
| 192 |
driver.get(url)
|
| 193 |
WebDriverWait(driver, 30).until(
|
| 194 |
EC.presence_of_element_located((By.TAG_NAME, "body"))
|
| 195 |
)
|
|
|
|
|
|
|
| 196 |
title = driver.title
|
|
|
|
|
|
|
| 197 |
content_selectors = [
|
| 198 |
(By.ID, "content"),
|
| 199 |
(By.CLASS_NAME, "mw-parser-output"),
|
|
|
|
| 202 |
(By.ID, "main"),
|
| 203 |
(By.TAG_NAME, "article")
|
| 204 |
]
|
|
|
|
| 205 |
main_content = None
|
| 206 |
for by, value in content_selectors:
|
| 207 |
try:
|
|
|
|
| 209 |
break
|
| 210 |
except:
|
| 211 |
continue
|
|
|
|
| 212 |
if not main_content:
|
| 213 |
main_content = driver.find_element(By.TAG_NAME, "body")
|
|
|
|
| 214 |
text_content = main_content.text
|
| 215 |
cleaned_content = clean_text(text_content)
|
| 216 |
logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
|
|
|
|
| 217 |
return {
|
| 218 |
"title": title,
|
| 219 |
"content": cleaned_content,
|
| 220 |
"url": url
|
| 221 |
}
|
|
|
|
| 222 |
except (WebDriverException, TimeoutException) as e:
|
| 223 |
logging.error(f"Error scraping {url}: {str(e)}")
|
| 224 |
st.error(f"Error scraping {url}: {str(e)}")
|
| 225 |
return None
|
|
|
|
| 226 |
finally:
|
| 227 |
try:
|
| 228 |
driver.quit()
|
| 229 |
service.stop()
|
| 230 |
+
if temp_dir:
|
| 231 |
+
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 232 |
+
logging.info(f"Cleaned up temp directory: {temp_dir}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
except Exception as e:
|
| 234 |
+
logging.warning(f"Error cleaning up WebDriver: {str(e)}")
|
| 235 |
|
| 236 |
@st.cache_resource
|
| 237 |
def initialize_qa_model():
|
|
|
|
| 248 |
tokenizer=tokenizer,
|
| 249 |
max_length=200
|
| 250 |
)
|
| 251 |
+
return st.session_state.qa_pipeline
|
| 252 |
except Exception as e:
|
| 253 |
st.error(f"Failed to load QA model: {str(e)}")
|
| 254 |
logging.error(f"Error loading QA model: {str(e)}")
|
|
|
|
| 352 |
elif app_mode == "About":
|
| 353 |
st.header("ℹ️ About")
|
| 354 |
st.markdown("""
|
| 355 |
+
This app uses Selenium for web scraping, LangChain for vector storage with
|
| 356 |
+
FAISS, and Hugging Face models for embeddings and question answering.
|
| 357 |
- **Web Scraping**: Extracts text using headless Chromium.
|
| 358 |
- **RAG**: Indexes content with sentence-transformers and answers questions using FLAN-T5.
|
| 359 |
- **Tech Stack**: Python, Streamlit, Selenium, LangChain, Hugging Face Transformers, FAISS.
|