File size: 4,053 Bytes
c32cdfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c91b827
c32cdfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""
Web scraping module
Scrapes web pages using Firecrawl and stores in Qdrant
"""

import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from langchain_core.documents import Document
from datetime import datetime
from qdrant_client import QdrantClient

# Add parent directory to sys.path so `src.*` imports resolve when this
# file is executed directly as a script (rather than as a package module).
current_dir = Path(__file__).resolve().parent
parent_dir = current_dir.parent
if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))

from src.vector_store import process_and_store

# Load environment variables (QDRANT_URL, QDRANT_API_KEY, QDRANT_COLLECTION,
# FIRECRAWL_API_KEY) from a .env file, if present.
load_dotenv()


def check_url_exists(url: str) -> int:
    """
    Check if URL already exists in Qdrant.

    Args:
        url: URL to check

    Returns:
        Number of existing chunks for this URL (0 if not found, or if the
        lookup fails for any reason — e.g. the collection does not exist yet).
    """
    client = QdrantClient(
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_API_KEY")
    )
    collection_name = os.getenv("QDRANT_COLLECTION", "hr-intervals")

    # Match points whose payload field `metadata.source` equals this URL.
    source_filter = {
        "must": [{"key": "metadata.source", "match": {"value": url}}]
    }

    try:
        # A single count query is sufficient. (A previous scroll() call
        # fetched a point here without ever using the result — removed,
        # saving one network round-trip per check.)
        count_result = client.count(
            collection_name=collection_name,
            count_filter=source_filter
        )
        return count_result.count
    except Exception:
        # Best-effort duplicate check: treat any failure (missing
        # collection, connectivity issues) as "URL not present".
        return 0


def scrape_url(url: str) -> str:
    """
    Scrape webpage content using Firecrawl.

    Args:
        url: URL to scrape

    Returns:
        Markdown content of the webpage

    Raises:
        ValueError: If the scrape result has an unexpected shape or the
            retrieved content is empty.
    """
    print(f"🌐 Scraping: {url}")

    app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
    result = app.scrape(url, formats=['markdown'])

    # Firecrawl versions differ in what scrape() returns: an object with a
    # `.markdown` attribute, or a plain dict keyed by 'markdown'. Use a
    # sentinel so a present-but-None attribute is still distinguished from
    # "not found at all".
    _missing = object()
    markdown_content = getattr(result, 'markdown', _missing)
    if markdown_content is _missing and isinstance(result, dict):
        markdown_content = result.get('markdown', _missing)
    if markdown_content is _missing:
        raise ValueError(f"Failed to scrape - unexpected result type: {type(result)}")

    if not markdown_content:
        raise ValueError("Failed to scrape - no content retrieved")

    return markdown_content


def process_and_store_webpage(url: str, force: bool = False) -> int:
    """
    Scrape webpage and store in vector database.

    Args:
        url: URL to scrape
        force: If True, skip duplicate check and store anyway

    Returns:
        Number of chunks created

    Raises:
        ValueError: If URL already exists and force=False
    """
    # Refuse to re-ingest a URL that is already in the collection,
    # unless the caller explicitly overrides the check.
    if not force:
        existing_chunks = check_url_exists(url)
        if existing_chunks:
            raise ValueError(
                f"URL already exists with {existing_chunks} chunks. "
                f"Use 'Delete' to remove it first, or force=True to add anyway."
            )

    # Fetch the page as markdown.
    markdown_content = scrape_url(url)
    print(f"   ✅ Scraped {len(markdown_content)} characters")

    # Wrap the content in a Document, tagging it so it can be found
    # (and de-duplicated) by source URL later.
    metadata = {
        "source": url,
        "type": "webpage",
        "upload_date": datetime.now().strftime("%Y-%m-%d"),
    }
    document = Document(page_content=markdown_content, metadata=metadata)

    # Chunk and store via the shared pipeline; returns the chunk count.
    return process_and_store([document])


# Manual smoke test: scrape one known page end-to-end.
if __name__ == "__main__":
    print("🧪 Testing web scraper...")

    sample_url = (
        "https://hrintervals.ca/resources/"
        "sample-policy-inclusive-and-equitable-hiring-practices/"
    )

    try:
        chunk_count = process_and_store_webpage(sample_url)
        print(f"\n🎉 Success! Processed {chunk_count} chunks")
    except Exception as e:
        print(f"\n❌ Error: {str(e)}")