File size: 5,308 Bytes
7b33cb7
e3a58b9
69521c1
e3a58b9
 
 
 
7b33cb7
e3a58b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e2304c7
e3a58b9
e2304c7
 
 
 
 
 
 
 
 
 
 
 
 
 
e3a58b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1b789f
e3a58b9
 
 
 
 
 
 
 
 
 
 
976bea9
e3a58b9
 
 
 
 
69521c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e3a58b9
69521c1
 
 
 
 
e3a58b9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import streamlit as st
from typing import Dict, Any 
import requests
from models import LinkNode, Status
from typing import Dict, Any
import os
from dotenv import load_dotenv

load_dotenv()

def display_map(link_map: Dict[str, Any]):
    """
    Display the entire link map in collapsible Streamlit expanders.

    Each raw dict in ``link_map`` is validated into a ``LinkNode``; entries
    that fail validation are reported via ``st.error`` and skipped. Nodes are
    rendered shallowest-first, each with a status banner, a summary, and an
    expander containing the extracted fields and the links found on the page.

    Args:
        link_map: Mapping of href -> serialized LinkNode dict, as returned
            by the scrape API.
    """
    st.header("🌐 Full Exploration Map")

    if not link_map:
        st.info("The exploration map is empty.")
        return

    # Validate raw dicts into LinkNode models, skipping malformed entries.
    validated_map: Dict[str, LinkNode] = {}
    for href, dict_node in link_map.items():
        try:
            validated_map[href] = LinkNode.model_validate(dict_node)
        except Exception as e:
            st.error(f"Failed to validate data for {href}. Skipping. Error: {e}")

    # Render shallowest pages first so the crawl reads top-down.
    sorted_map = sorted(validated_map.items(), key=lambda item: item[1].depth)

    for href, node in sorted_map:
        st.divider()
        st.subheader(f"πŸ“„ [{href}]({href})")
        if node.parent:
            st.caption(f"Found on: {node.parent}")

        status = node.overview.status
        if status == Status.RELEVANT:
            st.success("**Status: RELEVANT** βœ…")
        elif status == Status.IRRELEVANT:
            st.warning("**Status: IRRELEVANT** ⚠️ - Page deemed not relevant to search criteria.")
        elif status == Status.FAILED:
            st.error("**Status: FAILED** ❌ - Could not scrape or analyze this page.")
        else:
            st.info("**Status: UNKNOWN** 🟑")

        st.markdown("**πŸ“ Summary**")
        st.info(node.overview.summary)

        with st.expander("View Full Extracted Data and Found Links"):
            st.markdown("##### πŸ“‹ Full Extracted Data")
            overview_data = node.overview.model_dump()

            # Fixed display order; empty/missing fields are omitted entirely.
            display_order = ['details', 'required_docs', 'price', 'SLA']
            items_to_display = [
                (key.replace('_', ' ').capitalize(), str(value))
                for key in display_order
                if (value := overview_data.get(key))
            ]

            for i, (title, value) in enumerate(items_to_display):
                st.markdown(f"**{title}**")
                st.markdown(value)
                if i < len(items_to_display) - 1:
                    st.markdown("---")  # separator between fields, not after the last one
            st.markdown("##### πŸ”— Links Found on This Page")
            if node.child:
                st.write(f"Found **{len(node.child)}** link(s):")
                links_text = "\n".join(f"- {link}" for link in node.child)
                # Unique key per href so Streamlit widget state doesn't collide.
                st.text_area("Links", links_text, height=150, key=f"links_{href}")
            else:
                st.write("No valid links were found on this page.")

def main():
    """
    Streamlit entry point.

    Collects the starting URL and max crawl depth from the sidebar, posts them
    to the Flask scrape API at ``BASE_URI/scrape``, and renders the resulting
    link map and accumulated token usage.
    """
    st.title("πŸ€– Browser Agent: Visa Data Extractor (Streamlit Demo)")
    st.markdown("Enter an API Key and a URL to start a recursive web crawl for structured visa information.")

    with st.sidebar:
        st.header("Configuration")

        default_url = "https://www.netherlandsworldwide.nl/visa-the-netherlands/visa-application-form"

        url = st.text_input("Starting URL (e.g., website.com)", default_url)

        max_depth = st.slider("Max Exploration Depth", min_value=1, max_value=5, value=1)

        st.markdown("""
        **Note:** Depth 1 is fast. Depth 2 or 3 can be **very slow** and consume many tokens.
        """)

    # --- Main Execution ---
    if st.button("Start Exploration and Extraction"):
        if not url:
            st.error("Please enter a valid Starting URL.")
            return

        with st.spinner(f"Crawling {url} up to depth {max_depth}... (This may take a while)"):
            # BASE_URI comes from .env (loaded at module import) with a local default.
            BASE_URI = os.getenv("BASE_URI", "http://localhost:5000")
            try:
                result = requests.post(
                    f"{BASE_URI}/scrape",
                    headers={"Content-Type": "application/json"},
                    json={
                        "url": url,
                        "max_depth": max_depth
                    },
                    # Deep crawls are slow; cap the wait so the UI never hangs forever.
                    timeout=600,
                )
            except requests.exceptions.Timeout:
                st.error(f"The request to {BASE_URI} timed out. Try a smaller depth or retry.")
                return
            except requests.exceptions.ConnectionError:
                st.error(f"Connection Error: Could not connect to the Flask API at {BASE_URI}. Please ensure your Flask app is running (e.g., `flask run`).")
                return
            except Exception as e:
                st.exception(f"An unexpected error occurred during the crawl: {e}")
                return

        if result.status_code != 200:
            st.error(f"Exploration failed with status {result.status_code}: {result.text}")
            return

        data = result.json()

        display_map(data.get("link_map", {}))

        st.subheader("πŸ’° Accumulated Token Usage (All LLM Calls)")
        token_usage = data.get("token_usage", {"input": 0, "output": 0, "total": 0})
        # .get() guards against a partial token_usage payload from the API.
        st.write(f"**Input Tokens:** {token_usage.get('input', 0)}")
        st.write(f"**Output Tokens:** {token_usage.get('output', 0)}")
        st.write(f"**Total Tokens:** {token_usage.get('total', 0)}")
        
# Run the app when executed directly (e.g. `streamlit run <this file>`).
if __name__ == "__main__":
    main()