Spaces:
Build error
Build error
| from urllib.parse import urlparse | |
| from loguru import logger | |
| from tqdm import tqdm | |
| from typing_extensions import Annotated | |
| from clearml import PipelineDecorator | |
| from llm_engineering.application.crawlers.dispatcher import CrawlerDispatcher | |
def crawl_links(links: list[str]) -> Annotated[list[str], "crawled_links"]:
    """Crawl every link with the crawler selected for it and log the outcome.

    A crawler is obtained from a ``CrawlerDispatcher`` for each link and its
    ``extract`` method is called. Exceptions raised by ``extract`` are logged
    and counted as failures instead of being propagated, so one failing link
    does not stop the remaining ones. An overall summary and a per-domain
    summary are logged once all links have been processed.

    Args:
        links: URLs to crawl.

    Returns:
        The input ``links`` list, unchanged; links whose crawl failed are
        still included.
    """

    def _crawl_link(dispatcher: CrawlerDispatcher, link: str) -> tuple[bool, str]:
        """Crawl a single link and return ``(succeeded, domain)``."""
        # NOTE: get_crawler is called outside the try block, so an exception
        # raised while selecting the crawler propagates to the caller.
        crawler = dispatcher.get_crawler(link)
        crawler_domain = urlparse(link).netloc

        try:
            crawler.extract(link=link)
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            logger.error(f"An error occurred while crawling: {e!s}")
            return (False, crawler_domain)

        return (True, crawler_domain)

    def _add_to_metadata(metadata: dict, domain: str, succeeded: bool) -> dict:
        """Update the per-domain success/total counters in place and return them."""
        stats = metadata.setdefault(domain, {})
        stats["successful"] = stats.get("successful", 0) + int(succeeded)
        stats["total"] = stats.get("total", 0) + 1

        return metadata

    dispatcher = CrawlerDispatcher.build().register_github()

    logger.info(f"Starting to crawl {len(links)} link(s).")

    metadata: dict[str, dict[str, int]] = {}
    successfull_crawls = 0
    for link in tqdm(links):
        successfull_crawl, crawled_domain = _crawl_link(dispatcher, link)
        successfull_crawls += int(successfull_crawl)
        metadata = _add_to_metadata(metadata, crawled_domain, successfull_crawl)

    logger.info(f"Successfully crawled {successfull_crawls} / {len(links)} links.")

    # Surface the per-domain statistics; they were previously accumulated but
    # never read, logged, or returned.
    for domain, stats in metadata.items():
        logger.info(f"Domain '{domain}': crawled {stats['successful']} / {stats['total']} link(s).")

    return links