Spaces:

theYEH
/

Crawl4AI

Sleeping

Crawl4AI / docs /examples /summarize_page.py

amaye15

test

03c0888 over 1 year ago

1.78 kB

	import os
	import time
	import json
	from crawl4ai.web_crawler import WebCrawler
	from crawl4ai.chunking_strategy import *
	from crawl4ai.extraction_strategy import *
	from crawl4ai.crawler_strategy import *

	# Marketplace page that this example crawls and summarizes.
	url = "https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"

	# Build the crawler and call warmup() once before the first run() call.
	crawler = WebCrawler()
	crawler.warmup()

	from pydantic import BaseModel, Field

	class PageSummary(BaseModel):
	    # Shape of the structured summary requested from the LLM; its JSON schema
	    # (model_json_schema()) is passed to the extraction strategy.
	    # Documented with comments rather than a docstring on purpose: pydantic
	    # copies a class docstring into the generated schema's "description".

	    # All four fields are required (Field(...) declares no default).
	    title: str = Field(..., description="Title of the page.")
	    summary: str = Field(..., description="Summary of the page.")
	    brief_summary: str = Field(..., description="Brief summary of the page.")
	    keywords: list = Field(..., description="Keywords assigned to the page.")

	# Prompt sent with the schema; adjacent literals are joined into one string.
	extraction_instruction = (
	    "From the crawled content, extract the following details: "
	    "1. Title of the page "
	    "2. Summary of the page, which is a detailed summary "
	    "3. Brief summary of the page, which is a paragraph text "
	    "4. Keywords assigned to the page, which is a list of keywords. "
	    'The extracted JSON format should look like this: '
	    '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
	)

	# Schema-based LLM extraction; the API token is read from OPENAI_API_KEY
	# (os.getenv returns None when the variable is unset).
	summary_strategy = LLMExtractionStrategy(
	    provider="openai/gpt-4o",
	    api_token=os.getenv("OPENAI_API_KEY"),
	    schema=PageSummary.model_json_schema(),
	    extraction_type="schema",
	    apply_chunking=False,
	    instruction=extraction_instruction,
	)

	# Crawl the page with the cache bypassed and run the extraction on it.
	result = crawler.run(
	    url=url,
	    word_count_threshold=1,
	    extraction_strategy=summary_strategy,
	    bypass_cache=True,
	)

	# Parse the extracted JSON text; a malformed payload raises here, before
	# anything is written to disk.
	page_summary = json.loads(result.extracted_content)

	print(page_summary)

	# open() does not create missing parent directories, so make sure the
	# output folder exists before writing the raw extracted JSON into it.
	os.makedirs(".data", exist_ok=True)
	with open(".data/page_summary.json", "w", encoding="utf-8") as f:
	    f.write(result.extracted_content)