File size: 5,308 Bytes
7b33cb7
e3a58b9
69521c1
e3a58b9
 
 
 
7b33cb7
e3a58b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e2304c7
e3a58b9
e2304c7
 
 
 
 
 
 
 
 
 
 
 
 
 
e3a58b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1b789f
e3a58b9
 
 
 
 
 
 
 
 
 
 
976bea9
e3a58b9
 
 
 
 
69521c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e3a58b9
69521c1
 
 
 
 
e3a58b9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import streamlit as st
from typing import Dict, Any 
import requests
from models import LinkNode, Status
from typing import Dict, Any
import os
from dotenv import load_dotenv

load_dotenv()

def display_map(link_map: Dict[str, Any]):
    """
    Display the entire link map in collapsible Streamlit expanders.

    Each raw dict in ``link_map`` is validated into a ``LinkNode``; entries
    that fail validation are reported via ``st.error`` and skipped. Nodes are
    rendered shallowest-first, each with a status banner, a summary, and an
    expander containing the extracted fields and the links found on the page.

    Args:
        link_map: Mapping of href -> serialized LinkNode dict, as returned
            by the scrape API.
    """
    st.header("🌐 Full Exploration Map")

    if not link_map:
        st.info("The exploration map is empty.")
        return

    # Validate raw dicts into LinkNode models, skipping malformed entries.
    validated_map: Dict[str, LinkNode] = {}
    for href, dict_node in link_map.items():
        try:
            validated_map[href] = LinkNode.model_validate(dict_node)
        except Exception as e:
            st.error(f"Failed to validate data for {href}. Skipping. Error: {e}")

    # Render shallowest pages first so the crawl reads top-down.
    sorted_map = sorted(validated_map.items(), key=lambda item: item[1].depth)

    for href, node in sorted_map:
        st.divider()
        st.subheader(f"πŸ“„ [{href}]({href})")
        if node.parent:
            st.caption(f"Found on: {node.parent}")

        status = node.overview.status
        if status == Status.RELEVANT:
            st.success("**Status: RELEVANT** βœ…")
        elif status == Status.IRRELEVANT:
            st.warning("**Status: IRRELEVANT** ⚠️ - Page deemed not relevant to search criteria.")
        elif status == Status.FAILED:
            st.error("**Status: FAILED** ❌ - Could not scrape or analyze this page.")
        else:
            st.info("**Status: UNKNOWN** 🟑")

        st.markdown("**πŸ“ Summary**")
        st.info(node.overview.summary)

        with st.expander("View Full Extracted Data and Found Links"):
            st.markdown("##### πŸ“‹ Full Extracted Data")
            overview_data = node.overview.model_dump()

            # Fixed display order; empty/missing fields are omitted entirely.
            display_order = ['details', 'required_docs', 'price', 'SLA']
            items_to_display = [
                (key.replace('_', ' ').capitalize(), str(value))
                for key in display_order
                if (value := overview_data.get(key))
            ]

            for i, (title, value) in enumerate(items_to_display):
                st.markdown(f"**{title}**")
                st.markdown(value)
                if i < len(items_to_display) - 1:
                    st.markdown("---")  # separator between fields, not after the last one
            st.markdown("##### πŸ”— Links Found on This Page")
            if node.child:
                st.write(f"Found **{len(node.child)}** link(s):")
                links_text = "\n".join(f"- {link}" for link in node.child)
                # Unique key per href so Streamlit widget state doesn't collide.
                st.text_area("Links", links_text, height=150, key=f"links_{href}")
            else:
                st.write("No valid links were found on this page.")

def main():
    """
    Streamlit entry point.

    Collects the starting URL and max crawl depth from the sidebar, posts them
    to the Flask scrape API at ``BASE_URI/scrape``, and renders the resulting
    link map and accumulated token usage.
    """
    st.title("πŸ€– Browser Agent: Visa Data Extractor (Streamlit Demo)")
    st.markdown("Enter an API Key and a URL to start a recursive web crawl for structured visa information.")

    with st.sidebar:
        st.header("Configuration")

        default_url = "https://www.netherlandsworldwide.nl/visa-the-netherlands/visa-application-form"

        url = st.text_input("Starting URL (e.g., website.com)", default_url)

        max_depth = st.slider("Max Exploration Depth", min_value=1, max_value=5, value=1)

        st.markdown("""
        **Note:** Depth 1 is fast. Depth 2 or 3 can be **very slow** and consume many tokens.
        """)

    # --- Main Execution ---
    if st.button("Start Exploration and Extraction"):
        if not url:
            st.error("Please enter a valid Starting URL.")
            return

        with st.spinner(f"Crawling {url} up to depth {max_depth}... (This may take a while)"):
            # BASE_URI comes from .env (loaded at module import) with a local default.
            BASE_URI = os.getenv("BASE_URI", "http://localhost:5000")
            try:
                result = requests.post(
                    f"{BASE_URI}/scrape",
                    headers={"Content-Type": "application/json"},
                    json={
                        "url": url,
                        "max_depth": max_depth
                    },
                    # Deep crawls are slow; cap the wait so the UI never hangs forever.
                    timeout=600,
                )
            except requests.exceptions.Timeout:
                st.error(f"The request to {BASE_URI} timed out. Try a smaller depth or retry.")
                return
            except requests.exceptions.ConnectionError:
                st.error(f"Connection Error: Could not connect to the Flask API at {BASE_URI}. Please ensure your Flask app is running (e.g., `flask run`).")
                return
            except Exception as e:
                st.exception(f"An unexpected error occurred during the crawl: {e}")
                return

        if result.status_code != 200:
            st.error(f"Exploration failed with status {result.status_code}: {result.text}")
            return

        data = result.json()

        display_map(data.get("link_map", {}))

        st.subheader("πŸ’° Accumulated Token Usage (All LLM Calls)")
        token_usage = data.get("token_usage", {"input": 0, "output": 0, "total": 0})
        # .get() guards against a partial token_usage payload from the API.
        st.write(f"**Input Tokens:** {token_usage.get('input', 0)}")
        st.write(f"**Output Tokens:** {token_usage.get('output', 0)}")
        st.write(f"**Total Tokens:** {token_usage.get('total', 0)}")
        
# Run the app when executed directly (e.g. `streamlit run <this file>`).
if __name__ == "__main__":
    main()