Soham Waghmare committed on
Commit 87d5bfc · 1 Parent(s): 51f3191

feat: langgraph implementation for knet with SSE

.gitignore CHANGED
@@ -1,16 +1,17 @@
  **/.env
 
  # Flask ignore files
- backend/__pycache__/
- backend/*.pyc
- backend/*.pyo
- backend/*.pyd
- backend/*.pyo
- backend/.venv/
- backend/.env*
- backend/downloads/*
- backend/*.log.*
- backend/.ruff_cache/
+ **/__pycache__/
+ **/*.pyc
+ **/*.pyo
+ **/*.pyd
+ **/*.pyo
+ **/*.csv
+ **/.venv/
+ **/.env*
+ **/downloads/*
+ **/*.log.*
+ **/.ruff_cache/
 
  # Next.js ignore files
  # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
langgraph_backend/app.py ADDED
@@ -0,0 +1,301 @@
+ import asyncio
+ import json
+ import logging
+ import os
+ from datetime import datetime
+ from textwrap import dedent
+ from typing import Any, Dict, List, Optional, TypedDict
+
+ from dotenv import load_dotenv
+ from fastapi import FastAPI, Request
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import StreamingResponse
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langgraph.graph import END, StateGraph
+ from sse_starlette.sse import EventSourceResponse
+
+ from schema import ResearchPlan
+ from scraper import CrawlForAIScraper
+
+ load_dotenv()
+
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.INFO)
+
+ app = FastAPI()
+ CORS_ALLOWED_ORIGINS = os.getenv("ALLOWED_ORIGINS", ",").split(",")
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=CORS_ALLOWED_ORIGINS,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Session management (in-memory for now)
+ sessions: Dict[str, Dict[str, Any]] = {}
+
+
+ @app.get("/health")
+ async def health_check():
+     return {"status": "ok"}
+
+
+ # --- Prompt templates ---
+ RESEARCH_PLAN_PROMPT = dedent("""You are an expert Deep Research agent, part of a Multiagent system.
+
+ <User query>
+ {topic}
+ </User query>
+
+ ---
+ Generate a few very high-level steps on which other agents can do info collection runs. Provide only data collection steps; no data identification, summarization, manipulation, selection, etc.
+ Do not presume any knowledge about the topic.
+ Return a string array of steps.""")
+
+ REPORT_OUTLINE_PROMPT = dedent("""Generate an outline for a report based on the findings:
+ <Original user query>
+ {topic}
+ </Original user query>
+
+ <Findings>
+ {ctx_manager}
+ </Findings>
+
+ Deduplicate, reorganize and analyze the findings to create the outline.
+ If there are multiple comparisons, use a table instead of multiple headings.
+ The outline should include:
+ - Title
+ - List of h2 headings
+ Do not include hashtags""")
+
+ REPORT_FILLIN_PROMPT = dedent("""Fill in the content for the current outline heading based on the findings:
+ <Findings>
+ {ctx_manager}
+ </Findings>
+
+ <The outline>
+ {report_outline}
+ </The outline>
+
+ <Current outline heading to fill in>
+ ## {slot}
+ ...
+ </Current outline heading to fill in>
+
+ Assume [done] headings already have their respective content.
+ The content should be comprehensive, detailed and well-structured, providing detailed information on the current heading.
+ If needed, use tables and lists. Do not include subheadings.
+ Do not include the heading in the content.
+ """)
+
+ # --- LangChain LLM setup (Gemini, correct usage) ---
+ llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))
+
+
+ # --- State schema for LangGraph ---
+ class ResearchState(TypedDict, total=False):
+     topic: str
+     scraper: Any
+     max_depth: int
+     num_sites_per_query: int
+     steps: List[str]
+     findings: Any
+     outline: str
+     progress: int
+     message: str
+     timestamp: str
+     content: str
+     media: dict
+     research_tree: dict
+     metadata: dict
+
+
+ # --- LangGraph node: LLM step for research plan ---
+ async def research_plan_node(state: dict) -> dict:
+     topic = state["topic"]
+     prompt = RESEARCH_PLAN_PROMPT.format(topic=topic)
+     # with_structured_output(ResearchPlan) returns a ResearchPlan-shaped dict, not a raw message
+     result = await llm.with_structured_output(ResearchPlan).ainvoke(prompt)
+     try:
+         steps = result["steps"] if isinstance(result, dict) else json.loads(str(result))
+         # TODO: split this module into another knet module to handle global state
+     except Exception:
+         steps = [str(result)]
+     logger.info(f"Research plan:\n{json.dumps(steps, indent=2)}")
+     # Return the steps so downstream nodes can read them from the graph state
+     return {"steps": steps, "progress": 10, "message": "Generated research plan"}
+
+
+ # --- LangGraph node: Scrape for each step ---
+ async def scrape_node(state: dict) -> dict:
+     steps = state["steps"]
+     scraper = state["scraper"]
+     num_sites_per_query = state["num_sites_per_query"]
+     findings = []
+     for step in steps:
+         scraped = await scraper.search_and_scrape(step, num_sites=num_sites_per_query)
+         findings.append({"step": step, "data": scraped})
+     return {"findings": findings, "progress": 70, "message": "Scraping complete"}
+
+
+ # --- LangGraph node: Generate report outline ---
+ async def outline_node(state: dict) -> dict:
+     topic = state["topic"]
+     findings = state["findings"]
+     findings_text = json.dumps(findings, indent=2)
+     # The prompt template expects the findings under the {ctx_manager} placeholder
+     prompt = REPORT_OUTLINE_PROMPT.format(topic=topic, ctx_manager=findings_text)
+     result = await llm.ainvoke(prompt)
+     outline = result.content if hasattr(result, "content") else str(result)
+     return {"outline": outline, "progress": 90, "message": "Generated report outline"}
+
+
+ # --- LangGraph node: Fill in report content for each heading ---
+ async def fillin_node(state: dict) -> dict:
+     findings = state["findings"]
+     outline = state["outline"]
+     topic = state["topic"]
+     # Try to parse outline as JSON, else fall back to text splitting
+     try:
+         outline_obj = json.loads(outline)
+         title = outline_obj["title"]
+         headings = outline_obj["headings"]
+     except Exception:
+         # Fallback: try to extract headings from text
+         lines = outline.splitlines()
+         title = lines[0].strip("# ") if lines else topic
+         headings = [line.strip("# ") for line in lines if line.strip().startswith("## ")]
+     findings_text = json.dumps(findings, indent=2)
+     report = f"# {title}\n\n"
+     for heading in headings:
+         # Placeholders in REPORT_FILLIN_PROMPT are {ctx_manager}, {report_outline} and {slot}
+         prompt = REPORT_FILLIN_PROMPT.format(
+             ctx_manager=findings_text,
+             report_outline=outline,
+             slot=heading,
+         )
+         result = await llm.ainvoke(prompt)
+         content = result.content if hasattr(result, "content") else str(result)
+         # Remove heading if the LLM included it
+         if content.strip().startswith(heading):
+             content = content.strip()[len(heading) :].strip()
+         report += f"\n\n## {heading}\n\n{content}\n"
+     return {"content": report, "progress": 95, "message": "Filled in report content"}
+
+
+ # --- LangGraph node: Finalize report ---
+ def finalize_node(state: dict) -> dict:
+     findings = state.get("findings", [])
+     media = {"images": [], "videos": [], "links": []}
+     for step in findings:
+         for site in step.get("data", []):
+             media["images"].extend(site.get("images", []))
+             media["videos"].extend(site.get("videos", []))
+             media["links"].extend(site.get("links", []))
+     # Dedupe
+     media["images"] = list(set(media["images"]))
+     media["videos"] = list(set(media["videos"]))
+     # Links: dedupe by URL
+     seen_links = set()
+     deduped_links = []
+     for link in media["links"]:
+         url = link["href"] if isinstance(link, dict) and "href" in link else str(link)
+         if url not in seen_links:
+             seen_links.add(url)
+             deduped_links.append(link)
+     media["links"] = deduped_links
+     return {
+         "topic": state["topic"],
+         "timestamp": datetime.now().isoformat(),
+         "content": state["content"],
+         "media": media,
+         "research_tree": {},
+         "metadata": {"steps": state.get("steps", [])},
+         "progress": 100,
+         "message": "Research complete!",
+     }
+
+
+ # --- Main research logic using LangGraph ---
+ async def run_research(topic, scraper, max_depth, num_sites_per_query):
+     # Build the research graph
+     graph = StateGraph(state_schema=ResearchState)
+     graph.add_node("plan", research_plan_node)
+     graph.add_node("scrape", scrape_node)
+     graph.add_node("outline_node", outline_node)
+     graph.add_node("fillin", fillin_node)
+     graph.add_node("finalize", finalize_node)
+
+     graph.add_edge("plan", "scrape")
+     graph.add_edge("scrape", "outline_node")
+     graph.add_edge("outline_node", "fillin")
+     graph.add_edge("fillin", "finalize")
+     graph.add_edge("finalize", END)
+     graph.set_entry_point("plan")
+     graph = graph.compile()
+
+     state = {
+         "topic": topic,
+         "scraper": scraper,
+         "max_depth": max_depth,
+         "num_sites_per_query": num_sites_per_query,
+     }
+     # astream() yields {node_name: node_output} updates; accumulate them into a flat state
+     final_state: Dict[str, Any] = dict(state)
+     async for update in graph.astream(state):
+         node_output = next(iter(update.values()))
+         final_state.update(node_output)
+         progress = node_output.get("progress", 0)
+         message = node_output.get("message", "")
+         yield {
+             "event": "status",
+             "data": json.dumps({"progress": progress, "message": message}),
+         }
+     yield {
+         "event": "research_complete",
+         "data": json.dumps(
+             {
+                 "topic": final_state["topic"],
+                 "timestamp": final_state["timestamp"],
+                 "content": final_state["content"],
+                 "media": final_state["media"],
+                 "research_tree": final_state["research_tree"],
+                 "metadata": final_state["metadata"],
+             }
+         ),
+     }
+
+
+ @app.post("/start_research")
+ async def start_research(request: Request):
+     data = await request.json()
+     topic = data.get("topic", "").strip()
+     max_depth = int(data.get("max_depth", 1))
+     num_sites_per_query = int(data.get("num_sites_per_query", 5))
+     session_id = data.get("session_id") or os.urandom(8).hex()
+
+     if session_id not in sessions:
+         scraper = CrawlForAIScraper()
+         await scraper.start()
+         sessions[session_id] = {"scraper": scraper}
+     else:
+         scraper = sessions[session_id]["scraper"]
+
+     async def event_generator():
+         async for event in run_research(topic, scraper, max_depth, num_sites_per_query):
+             yield event
+
+     return EventSourceResponse(event_generator())
+
+
+ @app.post("/abort_research")
+ async def abort_research(request: Request):
+     data = await request.json()
+     session_id = data.get("session_id")
+     if session_id in sessions:
+         scraper = sessions[session_id]["scraper"]
+         await scraper.close()
+         del sessions[session_id]
+     return {"status": "aborted"}
+
+
+ # Add more endpoints as needed for test, etc.
+
+ if __name__ == "__main__":
+     logger.info("Starting KnowledgeNet server...")
+     import uvicorn
+
+     uvicorn.run(app, host="127.0.0.1", port=5000)
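For reference, a minimal client sketch for the /start_research SSE stream (not part of this commit). It assumes the server above is running locally on port 5000 and parses the event/data lines that sse-starlette emits; the use of requests here is purely illustrative.

import json

import requests


def consume_research_stream(topic: str) -> None:
    # Request the SSE stream and read it incrementally
    payload = {"topic": topic, "max_depth": 1, "num_sites_per_query": 5}
    with requests.post(
        "http://127.0.0.1:5000/start_research", json=payload, stream=True, timeout=None
    ) as resp:
        resp.raise_for_status()
        event = None
        for line in resp.iter_lines(decode_unicode=True):
            if line.startswith("event:"):
                event = line.split(":", 1)[1].strip()
            elif line.startswith("data:"):
                data = json.loads(line.split(":", 1)[1].strip())
                if event == "status":
                    print(f"[{data['progress']}%] {data['message']}")
                elif event == "research_complete":
                    print(data["content"][:500])


if __name__ == "__main__":
    consume_research_stream("History of the Blender project")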
langgraph_backend/pyproject.toml ADDED
@@ -0,0 +1,14 @@
+ [project]
+ name = "langgraph-backend"
+ version = "0.1.0"
+ requires-python = ">=3.11"
+ dependencies = [
+     "bs4>=0.0.2",
+     "crawl4ai>=0.6.3",
+     "fastapi>=0.115.12",
+     "langchain[google-genai]>=0.3.25",
+     "langgraph>=0.4.3",
+     "python-dotenv>=1.1.0",
+     "sse-starlette>=2.3.5",
+     "uvicorn>=0.34.2",
+ ]
langgraph_backend/schema.py ADDED
@@ -0,0 +1,22 @@
+ from typing import List, TypedDict
+
+
+ class ResearchPlan(TypedDict):
+     steps: List[str]
+
+
+ class ContinueBranch(TypedDict):
+     decision: bool
+
+
+ class SearchQuery(TypedDict):
+     branches: List[str]
+
+
+ class ReportOutline(TypedDict):
+     title: str
+     headings: List[str]
+
+
+ class ReportFillin(TypedDict):
+     content: str
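A brief sketch (not part of this commit) of how these TypedDict schemas are intended to be consumed via LangChain's structured output; only ResearchPlan is wired into app.py so far, the others are placeholders for later nodes.

from langchain_google_genai import ChatGoogleGenerativeAI

from schema import ReportOutline, ResearchPlan

llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")


async def demo(topic: str) -> None:
    # Structured output returns plain dicts shaped like the TypedDicts above,
    # e.g. {"steps": ["...", "..."]} for ResearchPlan
    plan = await llm.with_structured_output(ResearchPlan).ainvoke(f"Plan research steps for: {topic}")
    print(plan["steps"])

    # e.g. {"title": "...", "headings": ["...", "..."]} for ReportOutline
    outline = await llm.with_structured_output(ReportOutline).ainvoke(f"Outline a report on: {topic}")
    print(outline["title"], outline["headings"])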
langgraph_backend/scraper.py ADDED
@@ -0,0 +1,283 @@
+ import asyncio
+ import json
+ import logging
+ from typing import Any, Dict, List
+ from urllib.parse import quote_plus
+
+ import requests
+ from bs4 import BeautifulSoup
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode
+
+
+ class CrawlForAIScraper:
+     def __init__(self) -> None:
+         self.logger = logging.getLogger(__name__)
+         self.session = requests.Session()
+         self.base_browser = BrowserConfig(
+             browser_type="chromium",
+             headless=True,
+             viewport_width=1920,
+             viewport_height=1080,
+             accept_downloads=False,
+             verbose=False,
+         )
+         self.crawler = AsyncWebCrawler(config=self.base_browser)
+         self._is_started = False
+
+     async def start(self):
+         if not self._is_started:
+             await self.crawler.start()
+             # Non-blocking sleep so the event loop is not stalled
+             await asyncio.sleep(1)
+             self._is_started = True
+
+     async def close(self):
+         if self._is_started:
+             await self.crawler.close()
+             self._is_started = False
+
+     async def search_and_scrape(self, query: str, num_sites: int = 10) -> List[Dict[str, Any]]:
+         await self.start()
+         self.logger.info(f"Querying: {query}")
+
+         # Perform a search to get a list of webpages
+         search_results = await self._search(query)
+
+         # Scrape each webpage
+         scraped_data = []
+         self.logger.info(f"Scraping {num_sites} sites...")
+         next_idx = num_sites + 2
+         data = await self._scrape_pages(search_results[:next_idx], num_sites)
+         scraped_data.extend(data)
+
+         # Scrape further search results when some pages failed
+         for _ in range(3):
+             if len(scraped_data) < num_sites and next_idx < len(search_results):
+                 batch = search_results[next_idx : next_idx + (num_sites - len(scraped_data))]
+                 next_idx += len(batch)
+                 data = await self._scrape_pages(batch, num_sites - len(scraped_data))
+                 scraped_data.extend(data)
+
+         self.logger.info(f"Completed scraping {len(scraped_data)} sites")
+         return scraped_data[:num_sites]
+
+     async def _search(self, query: str) -> List[str]:
+         try:
+             encoded_query = quote_plus(query)
+             search_uri = f"https://www.google.com/search?q={encoded_query}"
+
+             result = await self.crawler.arun(
+                 url=search_uri,
+                 screenshot=False,
+                 cache_mode=CacheMode.BYPASS,
+                 delay_before_return_html=2,
+                 scan_full_page=True,
+             )
+
+             soup = BeautifulSoup(result.html, "html.parser")
+             search_results = []
+
+             for link in list(soup.select("div > span > a"))[2:]:
+                 url = (link.get("href") or "").replace(" ", "").replace("\n", "").strip()
+                 # Skip internal Google links before normalising the scheme
+                 if not url or "support.google.com" in url or url.startswith("/search?q="):
+                     continue
+                 if not url.startswith(("http://", "https://")):
+                     url = "https://" + url
+                 search_results.append(url)
+
+             # Fall back to DuckDuckGo (up to three attempts) if Google returned nothing
+             for _ in range(3):
+                 if not search_results:
+                     self.logger.warning("No search results found.")
+                     self.logger.info("Performing DuckDuckGo search as fallback...")
+                     search_results = await self._duckduckgo_search(query)
+
+             if not search_results:
+                 raise Exception("No results found")
+             self.logger.info(f"Found {len(search_results)} results")
+             return search_results
+
+         except Exception as e:
+             self.logger.error(f"Google search error: {str(e)}", exc_info=True)
+             raise
+
+     async def _duckduckgo_search(self, query: str) -> List[str]:
+         self.logger.info("Performing DuckDuckGo search...")
+         try:
+             encoded_query = quote_plus(query)
+             search_uri = f"https://html.duckduckgo.com/html/?q={encoded_query}"
+
+             # response = self.session.get(
+             #     url,
+             #     headers={
+             #         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+             #     },
+             #     timeout=10,
+             # )
+             # response.raise_for_status()
+
+             result = await self.crawler.arun(
+                 url=search_uri,
+                 screenshot=False,
+                 cache_mode=CacheMode.BYPASS,
+                 delay_before_return_html=2,
+                 scan_full_page=True,
+             )
+
+             soup = BeautifulSoup(result.html, "html.parser")
+             search_results = []
+
+             # DuckDuckGo search results are in elements with class 'result__url'
+             for link in soup.select(".result__url"):
+                 url = (link.get("href") or "").replace(" ", "").replace("\n", "")
+                 if not url:
+                     continue
+                 if not url.startswith(("http://", "https://")):
+                     url = "https://" + url
+                 search_results.append(url)
+
+             self.logger.info(f"Found {len(search_results)} URLs")
+             return search_results
+
+         except requests.exceptions.RequestException as e:  # Catch network errors specifically
+             self.logger.error(f"DuckDuckGo search error: {str(e)}")
+             return []
+         except Exception as e:  # Catch any other errors
+             self.logger.error(f"DuckDuckGo search error: {str(e)}")
+             return []
+
+     async def _scrape_pages(self, urls: List[str], max_sites: int) -> List[Dict[str, Any]]:
+         await self.start()
+
+         try:
+             # Run the crawler on the given URLs concurrently
+             results = await self.crawler.arun_many(
+                 urls=urls,
+                 screenshot=False,
+                 cache_mode=CacheMode.BYPASS,
+                 scan_full_page=True,
+                 semaphore_count=4,
+                 wait_for_images=True,
+                 scroll_delay=0.1,
+                 delay_before_return_html=2,
+                 exclude_external_images=True,
+                 page_timeout=25000,
+             )
+             scraped_sites = []
+             for result in results:
+                 if result.success:
+                     soup = BeautifulSoup(result.html, "html.parser")
+
+                     # Combine images
+                     extracted_images = self._extract_images(soup, result.url)
+                     media_images = []
+                     for img in result.media["images"]:
+                         if img["width"] is None or (isinstance(img["width"], (int, float)) and img["width"] > 300):
+                             # Resolve srcset-style values ("url1 300w, url2 600w") to the last URL
+                             src = img["src"]
+                             if " " in src and "w," in src:
+                                 candidates = [part.strip() for part in src.split(",") if part.strip()]
+                                 if candidates:
+                                     last_url = candidates[-1].split(" ")[0]
+                                     media_images.append(last_url)
+                             else:
+                                 media_images.append(src)
+                     all_images = list(set(extracted_images + media_images))
+
+                     # Combine videos
+                     all_videos = self._extract_videos(soup)
+                     media_videos = [v["src"] for v in result.media["videos"] if v["src"]]
+                     all_videos = list(set(all_videos + media_videos))
+
+                     data = {
+                         "url": result.url,
+                         "text": result.markdown,
+                         "images": all_images,
+                         "videos": all_videos,
+                         "links": self._extract_links(result.links["external"]),
+                     }
+                     scraped_sites.append(data)
+                     self.logger.info(f"  - {result.url[:80]}...")
+             return scraped_sites[:max_sites]
+
+         except Exception as e:
+             self.logger.error(f"Scraping error while scraping {urls}: {str(e)}")
+             return []
+
+     def _extract_images(self, soup: BeautifulSoup, url: str) -> List[str]:
+         # Extract images with width and height greater than 300 pixels
+         images = []
+         for img in soup.find_all("img"):
+             if "src" in img.attrs:
+                 src = img["src"]
+                 if "width" not in img.attrs or "height" not in img.attrs:
+                     continue
+                 if img.get("width").lower() == "auto":
+                     images.append((src, 999, 0))
+                     continue
+                 # Remove units from width and height: keep only digits and decimal points
+                 width = "".join([i for i in img.get("width", "0") if i.isdigit() or i == "."])
+                 height = "".join([i for i in img.get("height", "0") if i.isdigit() or i == "."])
+                 if width == "" or height == "":
+                     continue
+                 width, height = float(width), float(height)
+                 if width > 300 and height > 300 and "pixel" not in src and "icon" not in src:
+                     images.append((src, width, height))
+         # Sort by area, largest first
+         images = sorted(images, key=lambda img: -1 * (img[1] * img[2]))
+         images = [img[0] for img in images]
+
+         # Add base URL to relative URLs
+         base_url = "/".join(url.split("/")[:3])
+         images = [img if img.startswith("http") else base_url + img for img in images]
+         return images
+
+     def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
+         # Extract YouTube video links from iframes, video tags and anchors
+         videos = []
+         nodes = list(soup.find_all("iframe")) + list(soup.find_all("video")) + list(soup.find_all("a"))
+         for node in nodes:
+             src = node.get("src", "")
+             href = node.get("href", "")
+             # Skip unwanted sources
+             if any(
+                 keyword in src or keyword in href
+                 for keyword in ["accounts.google.com", "blob:", "youtube.com/redirect"]
+             ):
+                 continue
+             if "www.youtube.com/watch?v" in src:
+                 videos.append(src)
+             elif "www.youtube.com/watch?v" in href:
+                 videos.append(href)
+         return videos
+
+     def _extract_links(self, links: list) -> List[str]:
+         # Filter out unwanted links
+         filtered_links = []
+         for link in links:
+             url = link.get("href") or ""
+             if url.startswith(("http://", "https://")) and not any(
+                 keyword in url
+                 for keyword in ["support.google.com", "google.com", "accounts.google.com", "youtube.com", "blob:", "mailto:", "javascript:"]
+             ):
+                 filtered_links.append(link)
+         return filtered_links
+
+
+ if __name__ == "__main__":
+     # Testing the scraper
+     import sys
+
+     # NOTE: these URLs (and any passed on the command line) are collected but the
+     # test below currently only exercises search_and_scrape()
+     urls = [
+         "https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview",
+         "https://docs.crawl4ai.com/advanced/multi-url-crawling/",
+         "https://github.com/SesameAILabs/csm",
+         "https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview",
+         "https://docs.crawl4ai.com/advanced/multi-url-crawling/",
+         "https://github.com/SesameAILabs/csm",
+     ]
+     if len(sys.argv) > 1:
+         urls = sys.argv[1:]
+
+     async def main():
+         scraper = CrawlForAIScraper()
+         await scraper.start()
+         data = await scraper.search_and_scrape("blender.org")
+         await scraper.close()
+         with open("output.log.json", "w") as f:
+             f.write(json.dumps(data, indent=2))
+         print(json.dumps(data, indent=2))
+
+     asyncio.run(main())
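For orientation, a sketch of the record shape that search_and_scrape() returns for each successfully scraped site, as assembled in _scrape_pages() above (values are illustrative, not real output; the href-keyed link entry reflects how finalize_node in app.py dedupes links):

example_site = {
    "url": "https://example.com/article",
    "text": "Markdown extracted by crawl4ai for the page...",
    "images": ["https://example.com/figure1.png"],
    "videos": ["https://www.youtube.com/watch?v=example"],
    "links": [{"href": "https://example.org/source", "text": "Source"}],
}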
langgraph_backend/uv.lock ADDED
The diff for this file is too large to render. See raw diff