"""
Web scraping module
Scrapes web pages using Firecrawl and stores the resulting chunks in Qdrant
"""
import os
import sys
from datetime import datetime
from pathlib import Path

from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from langchain_core.documents import Document
from qdrant_client import QdrantClient

# Add parent directory to path so `src` is importable when run as a script
current_dir = Path(__file__).resolve().parent
parent_dir = current_dir.parent
if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))

from src.vector_store import process_and_store

load_dotenv()
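
# Environment variables expected (loaded from .env):
#   QDRANT_URL, QDRANT_API_KEY - Qdrant connection
#   QDRANT_COLLECTION          - collection name (defaults to "hr-intervals")
#   FIRECRAWL_API_KEY          - Firecrawl API access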


def check_url_exists(url: str) -> int:
    """
    Check if a URL already exists in Qdrant.

    Args:
        url: URL to check

    Returns:
        Number of existing chunks for this URL (0 if not found)
    """
    client = QdrantClient(
        url=os.getenv("QDRANT_URL"),
        api_key=os.getenv("QDRANT_API_KEY")
    )
    collection_name = os.getenv("QDRANT_COLLECTION", "hr-intervals")
    try:
        # Count the chunks whose stored metadata.source matches this URL
        count_result = client.count(
            collection_name=collection_name,
            count_filter={
                "must": [{"key": "metadata.source", "match": {"value": url}}]
            }
        )
        return count_result.count
    except Exception:
        # Treat any lookup failure (e.g. missing collection) as "not found"
        return 0
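

# Note: the dict-style filter used above is accepted by qdrant-client's REST
# client (pydantic coerces it into a Filter model). An equivalent typed form,
# as a sketch using qdrant_client.models:
#
#   from qdrant_client import models
#   models.Filter(must=[models.FieldCondition(
#       key="metadata.source", match=models.MatchValue(value=url))])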


def scrape_url(url: str) -> str:
    """
    Scrape webpage content using Firecrawl.

    Args:
        url: URL to scrape

    Returns:
        Markdown content of the webpage
    """
    print(f"🌐 Scraping: {url}")
    app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
    result = app.scrape(url, formats=['markdown'])

    # Handle different return types (the SDK may return an object or a dict)
    if hasattr(result, 'markdown'):
        markdown_content = result.markdown
    elif isinstance(result, dict) and 'markdown' in result:
        markdown_content = result['markdown']
    else:
        raise ValueError(f"Failed to scrape - unexpected result type: {type(result)}")

    if not markdown_content:
        raise ValueError("Failed to scrape - no content retrieved")

    return markdown_content


def process_and_store_webpage(url: str, force: bool = False) -> int:
    """
    Scrape a webpage and store it in the vector database.

    Args:
        url: URL to scrape
        force: If True, skip the duplicate check and store anyway

    Returns:
        Number of chunks created

    Raises:
        ValueError: If the URL already exists and force=False
    """
    # 0. Check if the URL already exists
    if not force:
        existing_chunks = check_url_exists(url)
        if existing_chunks > 0:
            raise ValueError(
                f"URL already exists with {existing_chunks} chunks. "
                f"Use 'Delete' to remove it first, or force=True to add anyway."
            )

    # 1. Scrape content
    markdown_content = scrape_url(url)
    print(f" ✅ Scraped {len(markdown_content)} characters")

    # 2. Create a document with metadata
    doc = Document(
        page_content=markdown_content,
        metadata={
            "source": url,
            "type": "webpage",
            "upload_date": datetime.now().strftime("%Y-%m-%d")
        }
    )

    # 3. Chunk and store (using the shared vector_store function)
    num_chunks = process_and_store([doc])
    return num_chunks
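

# Usage sketch (hypothetical URL): bypass the duplicate check and re-ingest a
# page that is already stored:
#
#   process_and_store_webpage("https://example.com/some-page", force=True)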


# Test function
if __name__ == "__main__":
    print("🧪 Testing web scraper...")

    # Test with a simple webpage
    test_url = "https://hrintervals.ca/resources/sample-policy-inclusive-and-equitable-hiring-practices/"
    try:
        num_chunks = process_and_store_webpage(test_url)
        print(f"\n🎉 Success! Processed {num_chunks} chunks")
    except Exception as e:
        print(f"\n❌ Error: {str(e)}")