Spaces:

muddasser
/

Webscrapping_Playwright

Sleeping

App Files Community

muddasser committed on Aug 28, 2025

Commit

07f51e6

verified ·

1 Parent(s): a7b818e

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -61

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import os
 import re
 import time
 import subprocess
-import tempfile
 import shutil
 import logging
 import psutil
@@ -20,21 +19,38 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.schema import Document
 import chromedriver_autoinstaller
-# Try importing transformers with error handling
 try:
     from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 except ImportError as e:
-    st.error(f"Failed to import transformers: {str(e)}. Please ensure transformers==4.44.2 is installed correctly.")
     logging.error(f"Transformers import failed: {str(e)}")
-    st.stop()
 # Set up logging
 logging.basicConfig(
     filename='/tmp/app.log',
-    level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 # Set page configuration
 st.set_page_config(
     page_title="Web Scraping + RAG Chatbot",
@@ -72,27 +88,22 @@ def cleanup_chromedriver_processes():
                     if any('type=renderer' in arg for arg in cmdline):
                         continue
                     active_processes.append(f"{name} (PID {proc.pid})")
-                    proc.terminate()  # Try graceful termination first
-                    try:
-                        proc.wait(timeout=3)  # Wait for process to exit
-                    except psutil.TimeoutExpired:
-                        proc.kill()  # Force kill if it doesn't exit
                     logging.info(f"Terminated process {name} PID {proc.pid}")
-            except (psutil.NoSuchProcess, psutil.AccessDenied):
                 pass
         if active_processes:
             logging.info(f"Terminated processes: {', '.join(active_processes)}")
         else:
-            logging.info("No Chrome-related processes found to terminate")
     except Exception as e:
         logging.warning(f"Error cleaning up processes: {str(e)}")
 def find_binary(binary_name):
     """Find binary path."""
     try:
-        result = subprocess.check_output(
-            ['which', binary_name], stderr=subprocess.DEVNULL
-        ).decode().strip()
         logging.info(f"Found {binary_name} at: {result}")
         return result if result else None
     except Exception as e:
@@ -124,6 +135,7 @@ def check_disk_space():
 def setup_driver():
     """Set up Selenium WebDriver."""
     cleanup_chromedriver_processes()
     check_disk_space()
     chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
@@ -186,45 +198,56 @@ def setup_driver():
     options.add_argument('--ignore-certificate-errors')
     options.binary_location = chromium_path
-    # Initialize ChromeDriver service with random port
-    service = Service(executable_path=chromedriver_path, port=0)
-    max_attempts = 5
-    for attempt in range(max_attempts):
-        try:
-            logging.info(f"Attempt {attempt + 1}/{max_attempts} to start ChromeDriver")
-            service.start()
-            driver = webdriver.Chrome(service=service, options=options)
-            driver.set_page_load_timeout(60)
-            logging.info(f"ChromeDriver initialized successfully on attempt {attempt + 1}")
-            return driver, service
-        except WebDriverException as e:
-            logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
-            if attempt < max_attempts - 1:
-                try:
-                    service.stop()
-                except:
-                    pass
-                time.sleep(3)
-            else:
-                logging.error(f"Failed to initialize WebDriver after {max_attempts} attempts: {str(e)}")
-                st.error(f"Failed to initialize WebDriver: {str(e)}")
-                return None, None
     return None, None
 def clean_text(text):
     """Clean and normalize scraped text."""
-    text = re.sub(r'\s+', ' ', text)
-    text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
-    return text.strip()
 def scrape_website(url):
     """Scrape data from the given URL."""
     driver, service = setup_driver()
     if not driver or not service:
-        st.error("Failed to initialize WebDriver. Please check if Chromium and ChromeDriver are properly installed.")
         return None
     try:
-        logging.info(f"Attempting to scrape {url}")
         driver.get(url)
         WebDriverWait(driver, 30).until(
             EC.presence_of_element_located((By.TAG_NAME, "body"))
@@ -242,11 +265,13 @@ def scrape_website(url):
         for by, value in content_selectors:
             try:
                 main_content = driver.find_element(by, value)
                 break
             except:
                 continue
         if not main_content:
             main_content = driver.find_element(By.TAG_NAME, "body")
         text_content = main_content.text
         cleaned_content = clean_text(text_content)
         logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
@@ -270,20 +295,25 @@ def scrape_website(url):
 @st.cache_resource
 def initialize_qa_model():
-    """Initialize the QA model."""
     if st.session_state.qa_pipeline is None:
         try:
             with st.spinner("Loading FLAN-T5 model..."):
                 model_name = "google/flan-t5-small"
                 tokenizer = AutoTokenizer.from_pretrained(model_name)
                 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-                st.session_state.qa_pipeline = pipeline(
-                    "text2text-generation",
-                    model=model,
-                    tokenizer=tokenizer,
-                    max_length=200
-                )
-            return st.session_state.qa_pipeline
         except Exception as e:
             st.error(f"Failed to load QA model: {str(e)}")
             logging.error(f"Error loading QA model: {str(e)}")
@@ -305,6 +335,7 @@ def create_vector_store(text):
             model_kwargs={'device': 'cpu'}
         )
         vector_store = FAISS.from_documents(documents, embeddings)
         return vector_store
     except Exception as e:
         st.error(f"Error creating vector store: {str(e)}")
@@ -312,7 +343,7 @@ def create_vector_store(text):
         return None
 def answer_question(question):
-    """Answer a question using RAG."""
     if st.session_state.vector_store is None:
         return "Please scrape a website first."
     if st.session_state.qa_pipeline is None:
@@ -326,13 +357,22 @@ def answer_question(question):
         Question: {question}
         Answer:
         """
-        result = st.session_state.qa_pipeline(
-            prompt,
-            max_length=200,
-            do_sample=False,
-            temperature=0.1
-        )
-        return result[0]['generated_text'].strip()
     except Exception as e:
         logging.error(f"Error answering question: {str(e)}")
         return f"Error generating answer: {str(e)}"
@@ -348,7 +388,7 @@ app_mode = st.sidebar.radio("Choose a mode", ["Web Scraping", "Chat with Content
 if app_mode == "Web Scraping":
     st.header("🌐 Web Scraping")
-    url = st.text_input("Enter URL to scrape", "https://en.wikipedia.org/wiki/Artificial_intelligence")
     if st.button("Scrape Website"):
         if url and is_valid_url(url):
             with st.spinner("Scraping website..."):
@@ -364,7 +404,7 @@ if app_mode == "Web Scraping":
                 else:
                     st.error("Failed to scrape the website. Check logs for details.")
         else:
-            st.warning("Please enter a valid URL (e.g., https://en.wikipedia.org/wiki/Artificial_intelligence).")
 elif app_mode == "Chat with Content":
     st.header("💬 Chat with Scraped Content")

 import re
 import time
 import subprocess
 import shutil
 import logging
 import psutil
 from langchain.schema import Document
 import chromedriver_autoinstaller
+# Try importing transformers with fallback
 try:
     from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+    import transformers
+    logging.info(f"Transformers version: {transformers.__version__}")
 except ImportError as e:
+    st.error(f"Failed to import transformers: {str(e)}. Attempting fallback without pipeline.")
     logging.error(f"Transformers import failed: {str(e)}")
+    try:
+        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+        import transformers
+        logging.info(f"Fallback: Imported AutoTokenizer and AutoModelForSeq2SeqLM, version: {transformers.__version__}")
+    except ImportError as e:
+        st.error(f"Failed to import transformers fallback: {str(e)}. Please ensure transformers==4.44.2 and tokenizers==0.19.1 are installed.")
+        logging.error(f"Transformers fallback import failed: {str(e)}")
+        st.stop()
 # Set up logging
 logging.basicConfig(
     filename='/tmp/app.log',
+    level=logging.DEBUG,  # Increased verbosity
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
+# Health check
+logging.info("Starting application health check")
+try:
+    logging.info(f"Python version: {subprocess.check_output(['python', '--version']).decode().strip()}")
+    logging.info(f"Pip list: {subprocess.check_output(['pip', 'list']).decode()}")
+except Exception as e:
+    logging.error(f"Health check failed: {str(e)}")
 # Set page configuration
 st.set_page_config(
     page_title="Web Scraping + RAG Chatbot",
                     if any('type=renderer' in arg for arg in cmdline):
                         continue
                     active_processes.append(f"{name} (PID {proc.pid})")
+                    proc.terminate()
+                    proc.wait(timeout=3)
                     logging.info(f"Terminated process {name} PID {proc.pid}")
+            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.TimeoutExpired):
                 pass
         if active_processes:
             logging.info(f"Terminated processes: {', '.join(active_processes)}")
         else:
+            logging.info("No Chrome-related processes found")
     except Exception as e:
         logging.warning(f"Error cleaning up processes: {str(e)}")
 def find_binary(binary_name):
     """Find binary path."""
     try:
+        result = subprocess.check_output(['which', binary_name], stderr=subprocess.DEVNULL).decode().strip()
         logging.info(f"Found {binary_name} at: {result}")
         return result if result else None
     except Exception as e:
 def setup_driver():
     """Set up Selenium WebDriver."""
+    logging.info("Setting up Selenium WebDriver")
     cleanup_chromedriver_processes()
     check_disk_space()
     chromium_path = os.getenv('CHROMIUM_PATH', '/usr/bin/chromium')
     options.add_argument('--ignore-certificate-errors')
     options.binary_location = chromium_path
+    # Initialize ChromeDriver
+    try:
+        service = Service(executable_path=chromedriver_path, port=0)
+        max_attempts = 3
+        for attempt in range(max_attempts):
+            try:
+                logging.info(f"Attempt {attempt + 1}/{max_attempts} to start ChromeDriver")
+                service.start()
+                driver = webdriver.Chrome(service=service, options=options)
+                driver.set_page_load_timeout(60)
+                logging.info(f"ChromeDriver initialized successfully on attempt {attempt + 1}")
+                return driver, service
+            except WebDriverException as e:
+                logging.warning(f"WebDriver init failed (attempt {attempt + 1}/{max_attempts}): {str(e)}")
+                if attempt < max_attempts - 1:
+                    try:
+                        service.stop()
+                    except:
+                        pass
+                    time.sleep(2)
+                else:
+                    logging.error(f"Failed to initialize WebDriver after {max_attempts} attempts: {str(e)}")
+                    st.error(f"Failed to initialize WebDriver: {str(e)}")
+                    return None, None
+    except Exception as e:
+        logging.error(f"Error initializing ChromeDriver service: {str(e)}")
+        st.error(f"Failed to initialize ChromeDriver service: {str(e)}")
+        return None, None
     return None, None
 def clean_text(text):
     """Clean and normalize scraped text."""
+    try:
+        text = re.sub(r'\s+', ' ', text)
+        text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
+        return text.strip()
+    except Exception as e:
+        logging.error(f"Error cleaning text: {str(e)}")
+        return text
 def scrape_website(url):
     """Scrape data from the given URL."""
+    logging.info(f"Starting scrape for URL: {url}")
     driver, service = setup_driver()
     if not driver or not service:
+        st.error("Failed to initialize WebDriver. Please check if Chromium and ChromeDriver are installed.")
+        logging.error("WebDriver initialization failed")
         return None
     try:
+        logging.info(f"Navigating to {url}")
         driver.get(url)
         WebDriverWait(driver, 30).until(
             EC.presence_of_element_located((By.TAG_NAME, "body"))
         for by, value in content_selectors:
             try:
                 main_content = driver.find_element(by, value)
+                logging.info(f"Found content with selector: {by}={value}")
                 break
             except:
                 continue
         if not main_content:
             main_content = driver.find_element(By.TAG_NAME, "body")
+            logging.info("Falling back to body tag for content")
         text_content = main_content.text
         cleaned_content = clean_text(text_content)
         logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
 @st.cache_resource
 def initialize_qa_model():
+    """Initialize the QA model with fallback."""
     if st.session_state.qa_pipeline is None:
         try:
             with st.spinner("Loading FLAN-T5 model..."):
                 model_name = "google/flan-t5-small"
                 tokenizer = AutoTokenizer.from_pretrained(model_name)
                 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+                try:
+                    st.session_state.qa_pipeline = pipeline(
+                        "text2text-generation",
+                        model=model,
+                        tokenizer=tokenizer,
+                        max_length=200
+                    )
+                    logging.info("Initialized QA pipeline successfully")
+                except NameError:
+                    logging.warning("Pipeline not available, using raw model and tokenizer")
+                    st.session_state.qa_pipeline = (model, tokenizer)
+                return st.session_state.qa_pipeline
         except Exception as e:
             st.error(f"Failed to load QA model: {str(e)}")
             logging.error(f"Error loading QA model: {str(e)}")
             model_kwargs={'device': 'cpu'}
         )
         vector_store = FAISS.from_documents(documents, embeddings)
+        logging.info("FAISS vector store created successfully")
         return vector_store
     except Exception as e:
         st.error(f"Error creating vector store: {str(e)}")
         return None
 def answer_question(question):
+    """Answer a question using RAG with fallback."""
     if st.session_state.vector_store is None:
         return "Please scrape a website first."
     if st.session_state.qa_pipeline is None:
         Question: {question}
         Answer:
         """
+        if isinstance(st.session_state.qa_pipeline, tuple):
+            # Fallback: Use raw model and tokenizer
+            model, tokenizer = st.session_state.qa_pipeline
+            inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
+            outputs = model.generate(**inputs, max_length=200, do_sample=False, temperature=0.1)
+            answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        else:
+            # Use pipeline
+            result = st.session_state.qa_pipeline(
+                prompt,
+                max_length=200,
+                do_sample=False,
+                temperature=0.1
+            )
+            answer = result[0]['generated_text']
+        return answer.strip()
     except Exception as e:
         logging.error(f"Error answering question: {str(e)}")
         return f"Error generating answer: {str(e)}"
 if app_mode == "Web Scraping":
     st.header("🌐 Web Scraping")
+    url = st.text_input("Enter URL to scrape", "https://example.com")
     if st.button("Scrape Website"):
         if url and is_valid_url(url):
             with st.spinner("Scraping website..."):
                 else:
                     st.error("Failed to scrape the website. Check logs for details.")
         else:
+            st.warning("Please enter a valid URL (e.g., https://example.com).")
 elif app_mode == "Chat with Content":
     st.header("💬 Chat with Scraped Content")