import asyncio
import os
import subprocess
import sys

import nest_asyncio
import streamlit as st
from crawl4ai import AsyncWebCrawler

# Patch asyncio so run_until_complete() can be called even though Streamlit
# may already be running an event loop in this thread.
nest_asyncio.apply()

st.set_page_config(page_title="Web Crawler App", layout="wide")
st.title("Web Crawler App")

# Hugging Face Spaces set SPACE_ID in the environment; use it to detect that host.
is_hf_space = os.environ.get("SPACE_ID") is not None


def install_playwright() -> bool:
    """Install Playwright plus the Chromium browser (with system deps).

    Runs both installs through the current interpreter so they land in the
    active environment. Progress and errors are reported via the Streamlit UI.

    Returns:
        True if both install steps succeeded, False otherwise.
    """
    try:
        st.info("Installing Playwright and browser dependencies. This may take a few minutes...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "playwright"])
        subprocess.check_call(
            [sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"]
        )
        st.success("Playwright and Chromium installed successfully!")
        return True
    except (subprocess.CalledProcessError, OSError) as e:
        # check_call raises CalledProcessError on a non-zero exit status and
        # OSError if the interpreter itself cannot be spawned.
        st.error(f"Failed to install dependencies: {str(e)}")
        return False


# On Hugging Face Spaces, surface the installer up front so the user can set
# up browser dependencies before attempting a crawl.
if is_hf_space:
    st.info("Running on Hugging Face Space. Make sure browser dependencies are installed.")
    if st.button("Install Browser Dependencies"):
        if install_playwright():
            st.info("Please restart the application after installation completes.")


def check_browser_installed() -> bool:
    """Return True if Playwright imports cleanly in a fresh interpreter.

    Spawns a subprocess so a broken install cannot crash this process. Note
    this only proves the Python package imports; it does not verify that the
    Chromium binary itself is present.
    """
    try:
        result = subprocess.run(
            [sys.executable, "-c", "from playwright.sync_api import sync_playwright; print('OK')"],
            capture_output=True,
            text=True,
        )
        return "OK" in result.stdout
    except OSError:
        # Was a bare `except:`; only a failure to spawn the interpreter is
        # expected here, so catch just that.
        return False


# Probe once per script run and surface the result to the user.
browser_installed = check_browser_installed()
if browser_installed:
    st.success("Browser dependencies are installed.")
else:
    st.warning("Browser dependencies may not be installed correctly. Crawling might fail.")

# --- Crawl configuration -----------------------------------------------------
url = st.text_input("Enter URL to crawl:", value="https://www.nbcnews.com/business")

with st.expander("Advanced Options"):
    max_depth = st.slider("Max Crawl Depth", min_value=1, max_value=5, value=1)
    timeout = st.slider("Timeout (seconds)", min_value=10, max_value=120, value=30)
    max_pages = st.number_input("Max Pages to Crawl", min_value=1, max_value=100, value=10)
    crawl_mode = st.selectbox(
        "Crawl Mode",
        options=["browser", "requests"],
        index=0,
        help="'browser' uses Playwright (more capable but slower), 'requests' is faster but may miss content",
    )


def run_async_crawler(url, max_depth=1, timeout=30, max_pages=10, mode="browser"):
    """Run a single crawl and return ``(markdown, error)``.

    Exactly one element of the pair is non-None: the crawl result rendered as
    markdown on success, or the error message string on failure.
    """

    async def _run():
        try:
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(
                    url=url,
                    max_depth=max_depth,
                    timeout=timeout,
                    max_pages=max_pages,
                    mode=mode,
                )
                return result.markdown, None
        except Exception as e:
            # Surface any crawler failure to the caller instead of crashing
            # the Streamlit script run.
            return None, str(e)

    # nest_asyncio.apply() (top of file) makes run_until_complete safe even
    # when an event loop is already running in this thread.
    return asyncio.get_event_loop().run_until_complete(_run())


if st.button("Start Crawling"):
    if not browser_installed and crawl_mode == "browser":
        st.warning("Browser dependencies not detected. Try installing them first or switch to 'requests' mode.")
    try:
        with st.spinner("Crawling in progress..."):
            result, error = run_async_crawler(
                url=url,
                max_depth=max_depth,
                timeout=timeout,
                max_pages=max_pages,
                mode=crawl_mode,
            )
        if error:
            st.error(f"Crawling failed: {error}")
            if "browser" in error.lower():
                st.error("This appears to be a browser-related error.")
                # NOTE(review): a st.button nested inside another button's
                # handler never fires in Streamlit — the outer click state is
                # lost on the rerun the inner click triggers. Kept for parity
                # with the original; use st.session_state if this action must
                # actually work.
                if st.button("Attempt to install browser dependencies"):
                    install_playwright()
        else:
            # Display the results and offer them as a markdown download.
            st.subheader("Crawl Results")
            st.markdown(result)
            st.download_button(
                label="Download Results",
                data=result,
                file_name="crawl_results.md",
                mime="text/markdown",
            )
    except Exception as e:
        # Last-resort guard so an unexpected failure shows in the UI.
        st.error(f"An error occurred: {str(e)}")

# --- Footer / help -----------------------------------------------------------
st.markdown("---")
st.info("""
This app uses the crawl4ai library to extract content from web pages.

**Hugging Face Space Setup Instructions:**
1. After launching the space, click "Install Browser Dependencies"
2. Restart the space after installation completes
3. You should now be able to crawl websites with the browser mode
""")

with st.sidebar:
    st.header("About")
    st.write("""
    This web crawler app requires Playwright and Chromium to be installed for the browser mode.

    If you're running on a Hugging Face space, use the "Install Browser Dependencies" button.

    If you're running locally, run:
    ```
    python -m playwright install --with-deps chromium
    ```
    """)