Spaces:
Paused
Paused
Soham Waghmare
committed on
Commit
·
54a7d14
1
Parent(s):
ceae8b0
feat: improve scraping and context handling
Browse files
- backend/.vscode/launch.json +25 -0
- backend/app.py +6 -12
- backend/crawl_ai.py +3 -1
- backend/knet.py +71 -122
- backend/output.json +620 -298
- backend/research_node.py +7 -4
- backend/scraper.py +82 -27
backend/.vscode/launch.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
// Use IntelliSense to learn about possible attributes.
|
| 3 |
+
// Hover to view descriptions of existing attributes.
|
| 4 |
+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
| 5 |
+
"version": "0.2.0",
|
| 6 |
+
"configurations": [
|
| 7 |
+
{
|
| 8 |
+
// Fastapi
|
| 9 |
+
"name": "Launch FastAPI",
|
| 10 |
+
"type": "debugpy",
|
| 11 |
+
"request": "launch",
|
| 12 |
+
"program": "${workspaceFolder}/app.py",
|
| 13 |
+
"args": ["run", "--reload"],
|
| 14 |
+
"justMyCode": true
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"name": "Python Debugger: Current File",
|
| 18 |
+
"type": "debugpy",
|
| 19 |
+
"request": "launch",
|
| 20 |
+
"program": "${file}",
|
| 21 |
+
"console": "integratedTerminal",
|
| 22 |
+
"justMyCode": true
|
| 23 |
+
}
|
| 24 |
+
]
|
| 25 |
+
}
|
backend/app.py
CHANGED
|
@@ -15,12 +15,12 @@ logger = logging.getLogger(__name__)
|
|
| 15 |
|
| 16 |
app = FastAPI()
|
| 17 |
# Increased pingTimeout and added logger
|
| 18 |
-
sio = socketio.AsyncServer(cors_allowed_origins="*", ping_timeout=60, async_mode="asgi")
|
| 19 |
app.mount('/', socketio.ASGIApp(sio))
|
| 20 |
|
| 21 |
# Initialize the scraper and KNet
|
| 22 |
-
|
| 23 |
-
scraper_instance = WebScraper()
|
| 24 |
knet = KNet(scraper_instance)
|
| 25 |
|
| 26 |
|
|
@@ -50,20 +50,14 @@ async def start_research(sid, data):
|
|
| 50 |
|
| 51 |
async def progress_callback(status):
|
| 52 |
try:
|
| 53 |
-
logger.debug(
|
| 54 |
-
|
| 55 |
-
)
|
| 56 |
-
await sio.emit(
|
| 57 |
-
"status",
|
| 58 |
-
{"message": status["message"], "progress": status["progress"]},
|
| 59 |
-
room=session_id,
|
| 60 |
-
)
|
| 61 |
except Exception as e:
|
| 62 |
logger.error(f"Error in progress callback: {str(e)}")
|
| 63 |
raise e
|
| 64 |
|
| 65 |
try:
|
| 66 |
-
research_results = knet.conduct_research(topic, progress_callback)
|
| 67 |
logger.info(f"Research completed for topic: {topic}")
|
| 68 |
await sio.emit("research_complete", research_results, room=session_id)
|
| 69 |
except Exception as e:
|
|
|
|
| 15 |
|
| 16 |
app = FastAPI()
|
| 17 |
# Increased pingTimeout and added logger
|
| 18 |
+
sio = socketio.AsyncServer(cors_allowed_origins="*", ping_timeout=60, ping_interval=10, async_mode="asgi")
|
| 19 |
app.mount('/', socketio.ASGIApp(sio))
|
| 20 |
|
| 21 |
# Initialize the scraper and KNet
|
| 22 |
+
scraper_instance = CrawlForAIScraper()
|
| 23 |
+
# scraper_instance = WebScraper()
|
| 24 |
knet = KNet(scraper_instance)
|
| 25 |
|
| 26 |
|
|
|
|
| 50 |
|
| 51 |
async def progress_callback(status):
|
| 52 |
try:
|
| 53 |
+
logger.debug(f"Progress update: {status['progress']}% - {status['message']}")
|
| 54 |
+
await sio.emit("status", {"message": status["message"], "progress": status["progress"]}, room=session_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
except Exception as e:
|
| 56 |
logger.error(f"Error in progress callback: {str(e)}")
|
| 57 |
raise e
|
| 58 |
|
| 59 |
try:
|
| 60 |
+
research_results = await knet.conduct_research(topic, progress_callback)
|
| 61 |
logger.info(f"Research completed for topic: {topic}")
|
| 62 |
await sio.emit("research_complete", research_results, room=session_id)
|
| 63 |
except Exception as e:
|
backend/crawl_ai.py
CHANGED
|
@@ -15,9 +15,10 @@ async def main():
|
|
| 15 |
# Create an instance of AsyncWebCrawler
|
| 16 |
async with AsyncWebCrawler(config=base_browser) as crawler:
|
| 17 |
# Run the crawler on a URL
|
| 18 |
-
result = await crawler.arun(url=sys.argv[1], screenshot=False, cache_mode=CacheMode.BYPASS
|
| 19 |
# Print the extracted content
|
| 20 |
hr = lambda: print(("-" * 80) * 2)
|
|
|
|
| 21 |
print(result.markdown)
|
| 22 |
hr()
|
| 23 |
print(json.dumps(result.media, indent=2))
|
|
@@ -25,6 +26,7 @@ async def main():
|
|
| 25 |
print(json.dumps(result.links, indent=2))
|
| 26 |
hr()
|
| 27 |
print(json.dumps(result.downloaded_files, indent=2))
|
|
|
|
| 28 |
|
| 29 |
# if result.success:
|
| 30 |
# # Save screenshot
|
|
|
|
| 15 |
# Create an instance of AsyncWebCrawler
|
| 16 |
async with AsyncWebCrawler(config=base_browser) as crawler:
|
| 17 |
# Run the crawler on a URL
|
| 18 |
+
result = await crawler.arun(url=sys.argv[1], screenshot=False, cache_mode=CacheMode.BYPASS)
|
| 19 |
# Print the extracted content
|
| 20 |
hr = lambda: print(("-" * 80) * 2)
|
| 21 |
+
hr()
|
| 22 |
print(result.markdown)
|
| 23 |
hr()
|
| 24 |
print(json.dumps(result.media, indent=2))
|
|
|
|
| 26 |
print(json.dumps(result.links, indent=2))
|
| 27 |
hr()
|
| 28 |
print(json.dumps(result.downloaded_files, indent=2))
|
| 29 |
+
hr()
|
| 30 |
|
| 31 |
# if result.success:
|
| 32 |
# # Save screenshot
|
backend/knet.py
CHANGED
|
@@ -6,7 +6,6 @@ import json
|
|
| 6 |
import os
|
| 7 |
from datetime import datetime
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
-
from scraper import WebScraper, CrawlForAIScraper
|
| 10 |
from research_node import ResearchNode
|
| 11 |
from collections import deque
|
| 12 |
import asyncio
|
|
@@ -20,10 +19,12 @@ class ResearchProgress:
|
|
| 20 |
self.progress = 0
|
| 21 |
self.callback = callback
|
| 22 |
|
| 23 |
-
def update(self, progress: int, message: str):
|
| 24 |
-
self.progress
|
|
|
|
|
|
|
| 25 |
if self.callback:
|
| 26 |
-
self.callback({"progress": progress, "message": message})
|
| 27 |
|
| 28 |
|
| 29 |
class KNet:
|
|
@@ -39,19 +40,20 @@ class KNet:
|
|
| 39 |
"gemini-2.0-flash-lite-preview-02-05",
|
| 40 |
generation_config={"temperature": 0.7},
|
| 41 |
)
|
|
|
|
| 42 |
|
| 43 |
self.research_manager = genai.GenerativeModel(
|
| 44 |
"gemini-2.0-flash-lite-preview-02-05",
|
| 45 |
generation_config={"temperature": 0.3},
|
| 46 |
)
|
|
|
|
| 47 |
|
| 48 |
# Initialize scraper
|
| 49 |
self.scraper = scraper_instance
|
| 50 |
|
| 51 |
self.logger = logging.getLogger(__name__)
|
| 52 |
-
self.max_depth =
|
| 53 |
self.max_breadth = 3
|
| 54 |
-
self.min_importance_score = 0.6
|
| 55 |
|
| 56 |
self.search_prompt = """Generate 3-5 specific search queries to research the following topic: {topic}
|
| 57 |
|
|
@@ -68,7 +70,8 @@ class KNet:
|
|
| 68 |
Current Topic: {query}
|
| 69 |
Current Depth: {depth}
|
| 70 |
Path from Root: {path}
|
| 71 |
-
Key Findings:
|
|
|
|
| 72 |
|
| 73 |
Consider:
|
| 74 |
1. Relevance to main topic
|
|
@@ -90,7 +93,7 @@ class KNet:
|
|
| 90 |
"response_mime_type": "application/json",
|
| 91 |
}
|
| 92 |
|
| 93 |
-
# Analysis schema
|
| 94 |
self.analysis_schema = {
|
| 95 |
"response_schema": content.Schema(
|
| 96 |
type=content.Type.OBJECT,
|
|
@@ -115,35 +118,32 @@ class KNet:
|
|
| 115 |
def _track_tokens(self, tokens: int) -> None:
|
| 116 |
self.token_count += tokens
|
| 117 |
|
| 118 |
-
def _should_branch_deeper(self, node: ResearchNode) -> bool:
|
| 119 |
-
findings
|
| 120 |
if node.data:
|
| 121 |
-
findings = "\n".join(
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
]
|
| 127 |
-
)
|
| 128 |
|
|
|
|
| 129 |
prompt = self.branch_decision_prompt.format(
|
| 130 |
query=node.query,
|
| 131 |
depth=node.depth,
|
| 132 |
path=" -> ".join(node.get_path_to_root()),
|
| 133 |
-
findings=
|
| 134 |
)
|
| 135 |
-
|
| 136 |
response = self.research_manager.generate_content(
|
| 137 |
prompt, generation_config={**self.branch_schema}
|
| 138 |
)
|
| 139 |
self._track_tokens(response.usage_metadata.total_token_count)
|
| 140 |
-
|
| 141 |
result = json.loads(response.text)
|
| 142 |
self.logger.info(f"Branch decision for '{node.query}': {result['decision']}")
|
| 143 |
|
| 144 |
return result["decision"]
|
| 145 |
|
| 146 |
-
def conduct_research(self, topic: str, progress_callback=None) -> Dict[str, Any]:
|
| 147 |
self.token_count = 0
|
| 148 |
progress = ResearchProgress(progress_callback)
|
| 149 |
self.logger.info(f"Starting research on topic: {topic}")
|
|
@@ -151,88 +151,79 @@ class KNet:
|
|
| 151 |
try:
|
| 152 |
root_node = ResearchNode(topic)
|
| 153 |
to_explore = deque([(root_node, 0)]) # (node, depth) pairs
|
| 154 |
-
explored_queries = set()
|
| 155 |
-
max_branches = self.max_breadth
|
| 156 |
|
| 157 |
-
progress.update(
|
| 158 |
|
| 159 |
-
while to_explore
|
| 160 |
current_node, current_depth = to_explore.popleft()
|
| 161 |
|
| 162 |
-
if current_node.query in explored_queries or current_depth >= self.max_depth:
|
| 163 |
continue
|
| 164 |
|
| 165 |
self.logger.info(f"Exploring: {current_node.query} (Depth: {current_depth})")
|
| 166 |
-
progress.update(
|
| 167 |
|
| 168 |
# Search and scrape
|
| 169 |
-
current_node.data = self.scraper.search_and_scrape(current_node.query)
|
|
|
|
| 170 |
explored_queries.add(current_node.query)
|
| 171 |
|
| 172 |
# Only branch if we have data and haven't reached max depth
|
| 173 |
if current_node.data and current_depth < self.max_depth:
|
| 174 |
-
if self._should_branch_deeper(current_node):
|
| 175 |
-
new_branches = self._analyze_and_branch(current_node)
|
| 176 |
for branch in new_branches:
|
| 177 |
to_explore.append((branch, current_depth + 1))
|
| 178 |
-
self.logger.info(f"Added {len(new_branches)} new
|
| 179 |
|
| 180 |
# Generate final report
|
| 181 |
-
progress.update(
|
| 182 |
final_report = self._generate_final_report(root_node)
|
| 183 |
-
final_report["metadata"]["total_tokens"] = self.token_count
|
| 184 |
|
| 185 |
-
self.logger.info(
|
| 186 |
-
|
| 187 |
-
)
|
| 188 |
-
progress.update(100, "Research complete!")
|
| 189 |
|
|
|
|
|
|
|
| 190 |
return final_report
|
| 191 |
|
| 192 |
except Exception as e:
|
| 193 |
self.logger.error(f"Research failed: {str(e)}")
|
| 194 |
raise e
|
| 195 |
|
| 196 |
-
def _analyze_and_branch(self, node: ResearchNode) -> List[ResearchNode]:
|
| 197 |
if not node.data:
|
| 198 |
return []
|
| 199 |
|
| 200 |
-
|
| 201 |
-
f"- {d.get('title', 'Untitled')}: {d.get('summary', d.get('text', '')[:200])}"
|
| 202 |
-
for d in node.data[:3] if d
|
| 203 |
-
])
|
| 204 |
-
|
| 205 |
-
analysis_prompt = f"""Based on the following findings about "{node.query}", suggest new research directions.
|
| 206 |
-
|
| 207 |
Findings:
|
| 208 |
-
{
|
| 209 |
|
| 210 |
-
Suggest up to
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
|
| 215 |
-
Return as JSON array of objects with
|
| 216 |
-
- importance (0.0-1.0)
|
| 217 |
- query (string)"""
|
| 218 |
|
| 219 |
try:
|
| 220 |
response = self.research_manager.generate_content(
|
| 221 |
-
analysis_prompt,
|
| 222 |
-
generation_config={**self.analysis_schema},
|
| 223 |
)
|
| 224 |
self._track_tokens(response.usage_metadata.total_token_count)
|
| 225 |
-
|
| 226 |
result = json.loads(response.text)
|
| 227 |
self.logger.info(f"New branches for '{node.query}': {result['branches']}")
|
| 228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
new_nodes = []
|
| 230 |
for branch in result.get("branches", []):
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
child_node.importance_score = branch["importance"]
|
| 234 |
-
new_nodes.append(child_node)
|
| 235 |
-
|
| 236 |
return new_nodes
|
| 237 |
|
| 238 |
except Exception as e:
|
|
@@ -240,78 +231,35 @@ class KNet:
|
|
| 240 |
return []
|
| 241 |
|
| 242 |
def _generate_final_report(self, root_node: ResearchNode) -> Dict[str, Any]:
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
return all_data
|
| 250 |
-
|
| 251 |
-
all_research_data = collect_data(root_node)
|
| 252 |
-
|
| 253 |
-
# Generate part 1 of the report
|
| 254 |
-
part1_prompt = f"""Generate part 1 of a research report focusing on overview and key findings.
|
| 255 |
-
Main Topic: {root_node.query}
|
| 256 |
-
|
| 257 |
-
Structure for Part 1:
|
| 258 |
-
1. Executive Summary (brief overview)
|
| 259 |
-
2. Key Findings (main discoveries and insights)
|
| 260 |
-
|
| 261 |
-
Keep it concise and focused. Part 2 will cover detailed analysis and references."""
|
| 262 |
-
|
| 263 |
-
response1 = self.research_manager.generate_content(part1_prompt)
|
| 264 |
-
self._track_tokens(response1.usage_metadata.total_token_count)
|
| 265 |
-
part1_content = response1.text
|
| 266 |
-
|
| 267 |
-
# Generate part 2 with awareness of part 1
|
| 268 |
-
part2_prompt = f"""Generate part 2 of the research report. Here's part 1 for context:
|
| 269 |
-
|
| 270 |
-
{part1_content}
|
| 271 |
-
|
| 272 |
-
Now continue with:
|
| 273 |
-
1. Detailed Analysis (expand on the key findings)
|
| 274 |
-
2. Related Topics and Branches (explore connections)
|
| 275 |
-
3. Sources and References (cite sources)
|
| 276 |
-
|
| 277 |
-
Focus on details that complement part 1 without repeating the same information."""
|
| 278 |
-
|
| 279 |
-
response2 = self.research_manager.generate_content(part2_prompt)
|
| 280 |
-
self._track_tokens(response2.usage_metadata.total_token_count)
|
| 281 |
-
|
| 282 |
-
# Combine reports with clear section separation
|
| 283 |
-
report_content = f"""# Research Report: {root_node.query}
|
| 284 |
-
|
| 285 |
-
Part 1: Overview and Key Findings
|
| 286 |
-
--------------------------------
|
| 287 |
-
{part1_content}
|
| 288 |
-
|
| 289 |
-
Part 2: Detailed Analysis and References
|
| 290 |
-
--------------------------------------
|
| 291 |
-
{response2.text}"""
|
| 292 |
|
| 293 |
-
#
|
| 294 |
media_content = {"images": [], "videos": [], "links": [], "references": []}
|
| 295 |
-
|
| 296 |
-
for data in
|
| 297 |
if data.get("images"):
|
| 298 |
media_content["images"].extend(data["images"])
|
| 299 |
if data.get("videos"):
|
| 300 |
media_content["videos"].extend(data["videos"])
|
| 301 |
if data.get("links"):
|
| 302 |
-
media_content["links"].
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
)
|
| 309 |
|
| 310 |
# Build research tree structure
|
| 311 |
def build_tree_structure(node: ResearchNode) -> Dict:
|
|
|
|
|
|
|
| 312 |
return {
|
| 313 |
"query": node.query,
|
| 314 |
-
"importance": node.importance_score,
|
| 315 |
"depth": node.depth,
|
| 316 |
"children": [build_tree_structure(child) for child in node.children],
|
| 317 |
}
|
|
@@ -319,11 +267,12 @@ Part 2: Detailed Analysis and References
|
|
| 319 |
return {
|
| 320 |
"topic": root_node.query,
|
| 321 |
"timestamp": datetime.now().isoformat(),
|
| 322 |
-
"content":
|
| 323 |
"media": media_content,
|
| 324 |
"research_tree": build_tree_structure(root_node),
|
| 325 |
"metadata": {
|
| 326 |
-
"
|
|
|
|
| 327 |
"max_depth_reached": root_node.max_depth(),
|
| 328 |
"total_tokens": self.token_count,
|
| 329 |
},
|
|
|
|
| 6 |
import os
|
| 7 |
from datetime import datetime
|
| 8 |
from dotenv import load_dotenv
|
|
|
|
| 9 |
from research_node import ResearchNode
|
| 10 |
from collections import deque
|
| 11 |
import asyncio
|
|
|
|
| 19 |
self.progress = 0
|
| 20 |
self.callback = callback
|
| 21 |
|
| 22 |
+
async def update(self, progress: int, message: str):
|
| 23 |
+
self.progress += progress
|
| 24 |
+
if self.progress > 100:
|
| 25 |
+
self.progress = 100
|
| 26 |
if self.callback:
|
| 27 |
+
await self.callback({"progress": self.progress, "message": message})
|
| 28 |
|
| 29 |
|
| 30 |
class KNet:
|
|
|
|
| 40 |
"gemini-2.0-flash-lite-preview-02-05",
|
| 41 |
generation_config={"temperature": 0.7},
|
| 42 |
)
|
| 43 |
+
self.ctx_researcher = []
|
| 44 |
|
| 45 |
self.research_manager = genai.GenerativeModel(
|
| 46 |
"gemini-2.0-flash-lite-preview-02-05",
|
| 47 |
generation_config={"temperature": 0.3},
|
| 48 |
)
|
| 49 |
+
self.ctx_manager = []
|
| 50 |
|
| 51 |
# Initialize scraper
|
| 52 |
self.scraper = scraper_instance
|
| 53 |
|
| 54 |
self.logger = logging.getLogger(__name__)
|
| 55 |
+
self.max_depth = 2
|
| 56 |
self.max_breadth = 3
|
|
|
|
| 57 |
|
| 58 |
self.search_prompt = """Generate 3-5 specific search queries to research the following topic: {topic}
|
| 59 |
|
|
|
|
| 70 |
Current Topic: {query}
|
| 71 |
Current Depth: {depth}
|
| 72 |
Path from Root: {path}
|
| 73 |
+
Key Findings:
|
| 74 |
+
{findings}
|
| 75 |
|
| 76 |
Consider:
|
| 77 |
1. Relevance to main topic
|
|
|
|
| 93 |
"response_mime_type": "application/json",
|
| 94 |
}
|
| 95 |
|
| 96 |
+
# Analysis schema
|
| 97 |
self.analysis_schema = {
|
| 98 |
"response_schema": content.Schema(
|
| 99 |
type=content.Type.OBJECT,
|
|
|
|
| 118 |
def _track_tokens(self, tokens: int) -> None:
|
| 119 |
self.token_count += tokens
|
| 120 |
|
| 121 |
+
def _should_branch_deeper(self, node: ResearchNode, topic: str) -> bool:
|
| 122 |
+
# Generate summary of key findings into research_manager's context
|
| 123 |
if node.data:
|
| 124 |
+
findings = ("\n" + "-"*10 + "Next data" + "-"*10 + "\n").join([json.dumps(d, indent=2) for d in node.data])
|
| 125 |
+
response = self.llm.generate_content(f"Extract key findings from the following data related to the topic '{topic}':\n{findings}")
|
| 126 |
+
self._track_tokens(response.usage_metadata.total_token_count)
|
| 127 |
+
findings = response.text
|
| 128 |
+
self.ctx_manager.append(findings)
|
|
|
|
|
|
|
| 129 |
|
| 130 |
+
# Research manager takes decision to proceed or not
|
| 131 |
prompt = self.branch_decision_prompt.format(
|
| 132 |
query=node.query,
|
| 133 |
depth=node.depth,
|
| 134 |
path=" -> ".join(node.get_path_to_root()),
|
| 135 |
+
findings="\n".join(self.ctx_manager),
|
| 136 |
)
|
|
|
|
| 137 |
response = self.research_manager.generate_content(
|
| 138 |
prompt, generation_config={**self.branch_schema}
|
| 139 |
)
|
| 140 |
self._track_tokens(response.usage_metadata.total_token_count)
|
|
|
|
| 141 |
result = json.loads(response.text)
|
| 142 |
self.logger.info(f"Branch decision for '{node.query}': {result['decision']}")
|
| 143 |
|
| 144 |
return result["decision"]
|
| 145 |
|
| 146 |
+
async def conduct_research(self, topic: str, progress_callback=None) -> Dict[str, Any]:
|
| 147 |
self.token_count = 0
|
| 148 |
progress = ResearchProgress(progress_callback)
|
| 149 |
self.logger.info(f"Starting research on topic: {topic}")
|
|
|
|
| 151 |
try:
|
| 152 |
root_node = ResearchNode(topic)
|
| 153 |
to_explore = deque([(root_node, 0)]) # (node, depth) pairs
|
| 154 |
+
explored_queries = set() # {string, string, ...}
|
|
|
|
| 155 |
|
| 156 |
+
await progress.update(5, "Starting research...")
|
| 157 |
|
| 158 |
+
while to_explore:
|
| 159 |
current_node, current_depth = to_explore.popleft()
|
| 160 |
|
| 161 |
+
if (current_node.query in explored_queries or current_depth >= self.max_depth):
|
| 162 |
continue
|
| 163 |
|
| 164 |
self.logger.info(f"Exploring: {current_node.query} (Depth: {current_depth})")
|
| 165 |
+
await progress.update(5, f"Exploring: {current_node.query}")
|
| 166 |
|
| 167 |
# Search and scrape
|
| 168 |
+
current_node.data = await self.scraper.search_and_scrape(current_node.query, 3) # node -> data = [{url:...}, {url:...}, ...]
|
| 169 |
+
self.ctx_researcher.append(json.dumps(current_node.data, indent=2))
|
| 170 |
explored_queries.add(current_node.query)
|
| 171 |
|
| 172 |
# Only branch if we have data and haven't reached max depth
|
| 173 |
if current_node.data and current_depth < self.max_depth:
|
| 174 |
+
if self._should_branch_deeper(current_node, topic):
|
| 175 |
+
new_branches = self._analyze_and_branch(current_node, topic)
|
| 176 |
for branch in new_branches:
|
| 177 |
to_explore.append((branch, current_depth + 1))
|
| 178 |
+
self.logger.info(f"Added {len(new_branches)} new branch(es) at depth {current_depth + 1}")
|
| 179 |
|
| 180 |
# Generate final report
|
| 181 |
+
await progress.update(30, "Generating comprehensive report...")
|
| 182 |
final_report = self._generate_final_report(root_node)
|
|
|
|
| 183 |
|
| 184 |
+
self.logger.info(f"Research completed. Explored {len(explored_queries)} queries across {root_node.max_depth()} levels")
|
| 185 |
+
await progress.update(100, "Research complete!")
|
|
|
|
|
|
|
| 186 |
|
| 187 |
+
with open("output.json", "w") as f:
|
| 188 |
+
json.dump(final_report, f, indent=2)
|
| 189 |
return final_report
|
| 190 |
|
| 191 |
except Exception as e:
|
| 192 |
self.logger.error(f"Research failed: {str(e)}")
|
| 193 |
raise e
|
| 194 |
|
| 195 |
+
def _analyze_and_branch(self, node: ResearchNode, topic: str) -> List[ResearchNode]:
|
| 196 |
if not node.data:
|
| 197 |
return []
|
| 198 |
|
| 199 |
+
analysis_prompt = f"""Based on the following findings about "{topic}", suggest new research directions.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
Findings:
|
| 201 |
+
{json.dumps(self.ctx_manager, indent=2)}
|
| 202 |
|
| 203 |
+
Suggest up to {self.max_breadth} specific google search queries that would help data which:
|
| 204 |
+
- Builds upon these findings
|
| 205 |
+
- Explores different aspects
|
| 206 |
+
- Goes deeper into important details
|
| 207 |
|
| 208 |
+
Return as JSON array of objects with properties:
|
|
|
|
| 209 |
- query (string)"""
|
| 210 |
|
| 211 |
try:
|
| 212 |
response = self.research_manager.generate_content(
|
| 213 |
+
analysis_prompt, generation_config={**self.analysis_schema}
|
|
|
|
| 214 |
)
|
| 215 |
self._track_tokens(response.usage_metadata.total_token_count)
|
|
|
|
| 216 |
result = json.loads(response.text)
|
| 217 |
self.logger.info(f"New branches for '{node.query}': {result['branches']}")
|
| 218 |
|
| 219 |
+
# Add children to current node
|
| 220 |
+
# +> child1
|
| 221 |
+
# node - +> child2
|
| 222 |
+
# +> child3
|
| 223 |
new_nodes = []
|
| 224 |
for branch in result.get("branches", []):
|
| 225 |
+
child_node = node.add_child(branch["query"])
|
| 226 |
+
new_nodes.append(child_node)
|
|
|
|
|
|
|
|
|
|
| 227 |
return new_nodes
|
| 228 |
|
| 229 |
except Exception as e:
|
|
|
|
| 231 |
return []
|
| 232 |
|
| 233 |
def _generate_final_report(self, root_node: ResearchNode) -> Dict[str, Any]:
|
| 234 |
+
findings = "\n".join(self.ctx_researcher)
|
| 235 |
+
prompt = f"""Generate a comprehensive report on the topic "{root_node.query}" based on the following research findings:
|
| 236 |
+
{findings}
|
| 237 |
+
"""
|
| 238 |
+
response = self.research_manager.generate_content(prompt)
|
| 239 |
+
self._track_tokens(response.usage_metadata.total_token_count)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
+
# Collate multimedia content
|
| 242 |
media_content = {"images": [], "videos": [], "links": [], "references": []}
|
| 243 |
+
all_sources_data = root_node.get_all_data()
|
| 244 |
+
for data in all_sources_data:
|
| 245 |
if data.get("images"):
|
| 246 |
media_content["images"].extend(data["images"])
|
| 247 |
if data.get("videos"):
|
| 248 |
media_content["videos"].extend(data["videos"])
|
| 249 |
if data.get("links"):
|
| 250 |
+
media_content["links"].extend([{"url": l["href"], "text": l["text"]} for l in data["links"]])
|
| 251 |
+
# Deduplicate
|
| 252 |
+
media_content["images"] = list(set(media_content["images"]))
|
| 253 |
+
media_content["videos"] = list(set(media_content["videos"]))
|
| 254 |
+
media_content["links"] = list({json.dumps(d, sort_keys=True) for d in media_content["links"]})
|
| 255 |
+
media_content["links"] = [json.loads(d) for d in media_content["links"]]
|
|
|
|
| 256 |
|
| 257 |
# Build research tree structure
|
| 258 |
def build_tree_structure(node: ResearchNode) -> Dict:
|
| 259 |
+
if not node:
|
| 260 |
+
return {}
|
| 261 |
return {
|
| 262 |
"query": node.query,
|
|
|
|
| 263 |
"depth": node.depth,
|
| 264 |
"children": [build_tree_structure(child) for child in node.children],
|
| 265 |
}
|
|
|
|
| 267 |
return {
|
| 268 |
"topic": root_node.query,
|
| 269 |
"timestamp": datetime.now().isoformat(),
|
| 270 |
+
"content": response.text,
|
| 271 |
"media": media_content,
|
| 272 |
"research_tree": build_tree_structure(root_node),
|
| 273 |
"metadata": {
|
| 274 |
+
"total_queries": root_node.total_children(),
|
| 275 |
+
"total_sources": len(all_sources_data),
|
| 276 |
"max_depth_reached": root_node.max_depth(),
|
| 277 |
"total_tokens": self.token_count,
|
| 278 |
},
|
backend/output.json
CHANGED
|
@@ -1,330 +1,652 @@
|
|
| 1 |
{
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
|
| 6 |
-
|
| 7 |
-
--------------------------------
|
| 8 |
-
## Part 1: Sleep and Its Importance - Overview and Key Findings
|
| 9 |
|
| 10 |
-
|
| 11 |
|
| 12 |
-
|
| 13 |
|
| 14 |
-
|
| 15 |
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
| 19 |
|
| 20 |
-
* **
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
|
| 23 |
|
|
|
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
|
| 29 |
-
|
| 30 |
|
| 31 |
-
|
| 32 |
|
| 33 |
-
* **
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
|
| 37 |
-
|
| 38 |
|
| 39 |
-
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
|
| 43 |
-
|
| 44 |
|
| 45 |
-
* **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
|
| 48 |
|
| 49 |
-
|
| 50 |
|
| 51 |
-
|
| 52 |
|
| 53 |
-
|
| 54 |
|
| 55 |
-
|
| 56 |
|
| 57 |
-
|
| 58 |
|
| 59 |
-
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
**3. Sources and References**
|
| 64 |
-
|
| 65 |
-
This section provides a list of sources and references used in the research report.
|
| 66 |
-
|
| 67 |
-
* **Carskadon, M. A., & Dement, W. C. (2011). Normal human sleep: An overview. *Principles and practice of sleep medicine*, 5, 16-26.** (Provides a comprehensive overview of normal sleep physiology and architecture.)
|
| 68 |
-
|
| 69 |
-
* **Walker, M. P. (2009). The role of sleep in cognition and emotion. *Annals of the New York Academy of Sciences*, *1156*, 168-197.** (Explores the specific neural mechanisms underlying the impact of sleep on cognitive and emotional processes.)
|
| 70 |
-
|
| 71 |
-
* **Gangwisch, J. E. (2009). Sleep and metabolic function. *Sleep*, *32*(8), 981-988.** (Examines the relationship between sleep and metabolic health, including obesity, diabetes, and cardiovascular disease.)
|
| 72 |
-
|
| 73 |
-
* **Riemann, D., Baglioni, C., Bassetti, C., Bjorvatn, B., Bonnet, M. H., & Espie, C. A. (2010). European guideline for the diagnosis and treatment of insomnia. *Journal of Sleep Research*, *19*(2), 137-160.** (Provides guidelines for the diagnosis and treatment of insomnia.)
|
| 74 |
-
|
| 75 |
-
* **National Institutes of Health (NIH). (2023). *Sleep Disorders*. Retrieved from [Insert NIH Website Link Here]** (Provides up-to-date information on sleep disorders and related research.)
|
| 76 |
-
|
| 77 |
-
* **American Academy of Sleep Medicine (AASM). (2023). *Sleep Education*. Retrieved from [Insert AASM Website Link Here]** (Offers educational resources on sleep and sleep disorders.)
|
| 78 |
-
|
| 79 |
-
* **Czeisler, C. A., Duffy, J. F., Shanahan, T. L., Brown, E. N., Rimmer, D. W., Ronda, J. M., ... & Kronauer, R. E. (1999). Stability, precision, and near-24-hour period of the human circadian pacemaker. *Science*, *284*(5423), 1329-1332.** (Provides research on the human circadian pacemaker.)
|
| 80 |
-
|
| 81 |
-
* **Goel, N., Rao, H., Durkin, P. R., & D'Ambrosio, D. (2013). Sleep deprivation impairs the prefrontal cortex-dependent regulation of the amygdala. *Journal of Neuroscience*, *33*(28), 11639-11646.** (Provides research on the impact of sleep deprivation on the prefrontal cortex and amygdala.)
|
| 82 |
-
|
| 83 |
-
* **Mander, B. A., Winer, J. R., Jagust, W. J., & Walker, M. P. (2016). Sleep disturbance and the aging brain. *Trends in Neurosciences*, *39*(1), 33-43.** (Provides research on the impact of sleep on the aging brain.)
|
| 84 |
-
|
| 85 |
-
**(Note: Replace the bracketed placeholders with the actual website links.)**
|
| 86 |
",
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
"
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
"
|
| 265 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
"children": [
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
"children": []
|
| 283 |
-
},
|
| 284 |
-
{
|
| 285 |
-
"query": "Determine the reversibility of sleep deprivation-induced cognitive impairments and the underlying molecular mechanisms, including the potential for interventions targeting epigenetic modifications or neurotransmitter imbalances to restore cognitive function.",
|
| 286 |
-
"importance": 0.7,
|
| 287 |
-
"depth": 2,
|
| 288 |
-
"children": []
|
| 289 |
-
}
|
| 290 |
-
]
|
| 291 |
-
},
|
| 292 |
-
{
|
| 293 |
-
"query": "Explore the relationship between sleep quality, duration, and the development of chronic diseases (e.g., cardiovascular disease, diabetes) across different age groups and demographics.",
|
| 294 |
-
"importance": 0.8,
|
| 295 |
-
"depth": 1,
|
| 296 |
-
"children": [
|
| 297 |
-
{
|
| 298 |
-
"query": "Investigate the mediating role of specific lifestyle behaviors (e.g., diet, physical activity, substance use) in the relationship between sleep duration/quality and the development of cardiovascular disease, diabetes, and obesity across different age groups and demographics. This should include longitudinal studies to establish causality.",
|
| 299 |
-
"importance": 0.9,
|
| 300 |
-
"depth": 2,
|
| 301 |
-
"children": []
|
| 302 |
-
},
|
| 303 |
-
{
|
| 304 |
-
"query": "Conduct a comparative analysis of the impact of different sleep disorders (e.g., insomnia, sleep apnea, restless legs syndrome) on the risk of developing chronic diseases, considering age, gender, and socioeconomic status. Explore potential mechanisms linking specific sleep disorders to disease development.",
|
| 305 |
-
"importance": 0.85,
|
| 306 |
-
"depth": 2,
|
| 307 |
-
"children": []
|
| 308 |
-
},
|
| 309 |
-
{
|
| 310 |
-
"query": "Examine the bidirectional relationship between sleep and chronic diseases. Specifically, how do chronic diseases impact sleep quality and duration, and how do changes in sleep patterns influence the progression and management of these diseases? This should include studies on the impact of disease treatments on sleep.",
|
| 311 |
-
"importance": 0.75,
|
| 312 |
-
"depth": 2,
|
| 313 |
-
"children": []
|
| 314 |
-
}
|
| 315 |
-
]
|
| 316 |
-
},
|
| 317 |
-
{
|
| 318 |
-
"query": "Conduct a longitudinal study to assess the impact of consistent sleep schedules and interventions promoting healthy sleep habits on mental health outcomes, including mood regulation and resilience to stress.",
|
| 319 |
-
"importance": 0.7,
|
| 320 |
-
"depth": 1,
|
| 321 |
-
"children": []
|
| 322 |
-
}
|
| 323 |
]
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
| 330 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"topic": "what are the different types of deep learning models",
|
| 3 |
+
"timestamp": "2025-03-07T18:04:44.789202",
|
| 4 |
+
"content": "Here's a comprehensive report on the different types of deep learning models, based on the provided research findings:
|
| 5 |
|
| 6 |
+
## Deep Dive into Deep Learning Models
|
|
|
|
|
|
|
| 7 |
|
| 8 |
+
Deep learning, a subset of machine learning, utilizes artificial neural networks to learn from data. These networks, inspired by the human brain, are composed of interconnected layers of nodes (neurons) that process and transform data. This report provides an overview of deep learning, its applications, benefits, challenges, and, most importantly, the various types of deep learning models.
|
| 9 |
|
| 10 |
+
### 1. What is Deep Learning?
|
| 11 |
|
| 12 |
+
Deep learning is a type of machine learning that employs artificial neural networks to learn from data. These networks are composed of multiple layers of interconnected nodes, each responsible for learning specific features of the data. The process involves training deep learning algorithms on large datasets of labeled data, allowing them to associate features with the correct labels. Once trained, these algorithms can make predictions on new, unseen data.
|
| 13 |
|
| 14 |
+
### 2. Deep Learning Applications
|
| 15 |
|
| 16 |
+
Deep learning has found applications in a wide array of fields, including:
|
| 17 |
|
| 18 |
+
* **Image Recognition:** Identifying objects and features in images.
|
| 19 |
+
* **Natural Language Processing (NLP):** Understanding the meaning of text, enabling applications like chatbots and spam filters.
|
| 20 |
+
* **Finance:** Analyzing financial data and predicting market trends.
|
| 21 |
+
* **Text to Image:** Converting text into images.
|
| 22 |
|
| 23 |
+
### 3. Benefits of Deep Learning Models
|
| 24 |
|
| 25 |
+
Deep learning models offer several advantages over traditional machine learning methods:
|
| 26 |
|
| 27 |
+
* **Learning Complex Relationships:** They can learn intricate relationships between features in data, leading to more powerful and accurate models.
|
| 28 |
+
* **Scalability:** They can be trained on large datasets, allowing them to learn from a wider range of experiences and make more accurate predictions.
|
| 29 |
+
* **Data-Driven Learning:** They require less human intervention, increasing efficiency and scalability.
|
| 30 |
|
| 31 |
+
### 4. Challenges of Deep Learning Models
|
| 32 |
|
| 33 |
+
Despite their benefits, deep learning models also face several challenges:
|
| 34 |
|
| 35 |
+
* **Data Requirements:** They require large amounts of data to learn effectively, which can be a limitation in domains with limited data availability.
|
| 36 |
+
* **Overfitting:** They can overfit the training data, learning the noise rather than the underlying relationships.
|
| 37 |
+
* **Bias:** They can be biased based on the data they are trained on, leading to unfair or inaccurate predictions.
|
| 38 |
|
| 39 |
+
### 5. Types of Deep Learning Models
|
| 40 |
|
| 41 |
+
The research findings highlight three common types of deep learning models:
|
| 42 |
|
| 43 |
+
* **Convolutional Neural Networks (CNNs):** CNNs are specifically designed for image recognition and processing tasks. They excel at identifying objects in images, even when the objects are partially obscured or distorted.
|
| 44 |
+
* **Recurrent Neural Networks (RNNs):** RNNs are well-suited for natural language processing and speech recognition. They are particularly effective at understanding the context of a sentence or phrase and can be used for tasks like text generation and language translation.
|
| 45 |
+
* **Deep Belief Networks (DBNs):** DBNs are generative models composed of multiple layers of stochastic, latent variables. They are used for feature extraction and dimensionality reduction.
|
| 46 |
|
| 47 |
+
### 6. Other Deep Learning Architectures (From External Source)
|
| 48 |
|
| 49 |
+
The provided external source mentions additional deep learning architectures, including:
|
| 50 |
|
| 51 |
+
* **Long Short-Term Memory Networks (LSTMs):** A type of RNN designed to handle long-term dependencies in sequential data, making them suitable for tasks like speech recognition and time series prediction.
|
| 52 |
+
* **Gated Recurrent Unit (GRU):** A variant of LSTM with fewer parameters, making them suitable for smaller datasets.
|
| 53 |
+
* **Generative Adversarial Networks (GANs):** Used for generating realistic data, such as images, videos, and audio, by training two neural networks in a competitive setting.
|
| 54 |
+
* **Transformer Networks:** A powerful architecture that uses self-attention mechanisms to understand context and relationships in sequential data, making them suitable for NLP tasks like translation and text generation.
|
| 55 |
+
* **Autoencoders:** Used for dimensionality reduction, anomaly detection, and feature learning.
|
| 56 |
+
* **Deep Stacking Networks (DSNs):** A set of individual deep networks, each with its own hidden layers, designed to improve training and handle complex classification tasks.
|
| 57 |
|
| 58 |
+
### 7. Semi-Supervised Learning
|
| 59 |
|
| 60 |
+
Semi-supervised learning is a machine learning approach that combines labeled and unlabeled data for training. This is particularly useful when labeled data is scarce and expensive to obtain, while unlabeled data is abundant. The goal is to leverage the information in both types of data to improve model performance.
|
| 61 |
|
| 62 |
+
### 8. Reinforcement Learning
|
| 63 |
|
| 64 |
+
Reinforcement learning is a machine learning paradigm where an agent learns to make decisions in an environment to maximize a reward. The agent interacts with the environment, receives feedback (rewards or penalties), and adjusts its actions to achieve its goals.
|
| 65 |
|
| 66 |
+
### 9. Reinforcement Learning from Human Feedback (RLHF)
|
| 67 |
|
| 68 |
+
RLHF is a technique used to improve large language models (LLMs) by incorporating human preferences into the training process. This involves training a reward model based on human feedback and then using this reward model to fine-tune the LLM using reinforcement learning. This approach has been used in models like ChatGPT and InstructGPT to generate more natural and human-like responses.
|
| 69 |
|
| 70 |
+
### 10. Conclusion
|
| 71 |
|
| 72 |
+
Deep learning models are a powerful tool for solving complex problems across various domains. The choice of the right model depends on the specific task, the nature of the data, and the desired outcome. As the field of deep learning continues to evolve, new architectures and techniques will emerge, further expanding the capabilities of these models.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
",
|
| 74 |
+
"media": {
|
| 75 |
+
"images": [
|
| 76 |
+
"https://deepsense.ai/wp-content/uploads/2024/11/Figure-1-Example-usage-of-ChatGPT-to-analyze-worst-case-time-complexity-of-bubble-sorting-in-the-specified-style.jpeg",
|
| 77 |
+
"https://addepto.com/wp-content/uploads/2023/02/featured-images_blog-16.jpg",
|
| 78 |
+
"https://addepto.com/wp-content/uploads/2021/08/at_is_entropy_in_machine_learning_1_.webp",
|
| 79 |
+
"https://deepsense.ai/wp-content/smush-webp/2025/03/Miniatura-Unstructured-1024x614.png.webp",
|
| 80 |
+
"https://deepsense.ai/wp-content/uploads/2024/11/Figure-3-Experiment-video-screenshot.png",
|
| 81 |
+
"https://addepto.com/wp-content/uploads/2020/07/Deep_Learning_Architecture_1_.webp",
|
| 82 |
+
"https://addepto.com/wp-content/uploads/2020/02/ing_-_5_Examples_Of_How_To_Use_It_1_.webp",
|
| 83 |
+
"https://i.ytimg.com/vi/olFxW7kdtP8/hqdefault.jpg",
|
| 84 |
+
"https://images.prismic.io/turing/652ec261fbd9a45bcec81941_Reinforcement_Learning_4_11zon_946380769c.webp?auto=format%2Ccompress&fit=max&w=3840",
|
| 85 |
+
"https://www.sas.com/en_ie/insights/articles/analytics/machine-learning-algorithms/_jcr_content/par02/image_8597.img.jpg/1494256305219.jpg",
|
| 86 |
+
"https://www.simplilearn.com/ice9/free_resources_article_thumb/ML-banner_1.jpg",
|
| 87 |
+
"https://www.simplilearn.com/ice9/free_resources_article_thumb/how_to_become_machine_learning_engineer.jpg",
|
| 88 |
+
"https://deepsense.ai/wp-content/uploads/2024/10/6-a-2.png",
|
| 89 |
+
"https://www.simplilearn.com/ice9/free_resources_article_thumb/AI_vs_Machine_Learning_vs_Deep_Learning.jpg",
|
| 90 |
+
"https://www.simplilearn.com/ice9/ebooks/ML_careerguide.jpg",
|
| 91 |
+
"https://www.simplilearn.com/ice9/ebooks/ML_eBook_FRS.jpg",
|
| 92 |
+
"https://www.datarobot.com/wp-content/uploads/2022/02/image3-2-1024x390.jpeg",
|
| 93 |
+
"https://images.prismic.io/turing/65980f9f531ac2845a2728df_reinforcement_learning_example_82cc17b798.webp?auto=format,compress",
|
| 94 |
+
"https://deepsense.ai/wp-content/uploads/2024/11/Figure-8-Fine-tuning-with-Reinforcement-Learning.png",
|
| 95 |
+
"https://www.datarobot.com/wp-content/uploads/2022/02/image5-1-1-1024x892.jpeg",
|
| 96 |
+
"https://images.prismic.io/turing/65980f9e531ac2845a2728de_reinforcement_learning_algorithm_2a63561d9a.webp?auto=format,compress",
|
| 97 |
+
"https://deepsense.ai/wp-content/uploads/2024/11/Figure-7-Reward-model-training.png",
|
| 98 |
+
"https://www.simplilearn.com/ice9/free_resources_article_thumb/Deep_learning_algorithm.jpg",
|
| 99 |
+
"https://media.geeksforgeeks.org/img-practice/prod/courses/554/Mobile/Other/data_science_1720847526.webp",
|
| 100 |
+
"https://media.geeksforgeeks.org/img-practice/prod/courses/405/Mobile/Other/Course_ML_&_DS_1720846555.webp",
|
| 101 |
+
"https://deepsense.ai/wp-content/smush-webp/2025/03/scott-rodgerson-PSpf_XgOM5w-unsplash-1024x683.jpg.webp",
|
| 102 |
+
"https://deepsense.ai/wp-content/smush-webp/2025/03/partnerzy_blog-1024x614.png.webp",
|
| 103 |
+
"https://media.geeksforgeeks.org/img-practice/prod/courses/808/Web/Content/ai-for-kids-webp_1728467027.webp",
|
| 104 |
+
"https://addepto.com/wp-content/uploads/2022/09/featured-images_blog-4.webp",
|
| 105 |
+
"https://ars.els-cdn.com/content/image/3-s2.0-B9781782421795500055-f05-65-9781782421795.jpg",
|
| 106 |
+
"https://www.simplilearn.com/ice9/free_resources_article_thumb/ArtificalNeuralNetwork.PNG",
|
| 107 |
+
"https://deepsense.ai/wp-content/uploads/2024/11/Figure-6-Language-model-pretraining.png",
|
| 108 |
+
"https://media.geeksforgeeks.org/wp-content/uploads/20220805171912/ProposedSemisupervisedLearningProcess.jpg",
|
| 109 |
+
"https://images.prismic.io/turing/65980fa0531ac2845a2728e0_reinforcement_learning_techniques_0d7f3f5e9c.webp?auto=format,compress",
|
| 110 |
+
"https://deepsense.ai/wp-content/uploads/2024/11/Figure-5-ChatGPT-fine-tuning-steps.png",
|
| 111 |
+
"https://addepto.com/wp-content/uploads/2024/12/contextcheck-2.jpg",
|
| 112 |
+
"https://deepsense.ai/wp-content/uploads/2024/11/Figure-4-Reinforcement-learning-from-human-feedback-training-loop.png",
|
| 113 |
+
"https://addepto.com/wp-content/uploads/2023/03/e-learning-consulting-featured-image.webp",
|
| 114 |
+
"https://deepsense.ai/wp-content/uploads/2024/10/6-a-3.png",
|
| 115 |
+
"https://deepsense.ai/wp-content/smush-webp/2024/11/How-can-we-improve-language-models-using-reinforcement-learning-ChatGPT-case-study-1024x303.jpeg.webp",
|
| 116 |
+
"https://deepsense.ai/wp-content/uploads/2024/11/Figure-2-Classic-reinforcement-learning-training-loop.png"
|
| 117 |
+
],
|
| 118 |
+
"videos": [
|
| 119 |
+
"https://www.youtube.com/watch?v=oC7Cw3fu3gU",
|
| 120 |
+
"https://www.youtube.com/@deepsenseai",
|
| 121 |
+
"https://www.youtube.com/googlecloud",
|
| 122 |
+
"https://www.youtube.com/user/Simplilearn",
|
| 123 |
+
"https://www.youtube.com/c/sasusers",
|
| 124 |
+
"https://www.youtube.com/geeksforgeeksvideos",
|
| 125 |
+
"https://www.youtube.com/channel/UCLfxdtB3H0JdnniBnd4DoNA",
|
| 126 |
+
"https://www.youtube.com/SASsoftware",
|
| 127 |
+
"https://www.youtube.com/googlecloudplatform"
|
| 128 |
+
],
|
| 129 |
+
"links": [
|
| 130 |
+
{
|
| 131 |
+
"text": "Learn more about this provider",
|
| 132 |
+
"url": "https://usercentrics.com/privacy-policy/"
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"text": "Contact and support",
|
| 136 |
+
"url": "https://service.elsevier.com/app/contact/supporthub/sciencedirect/"
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"text": "Cookiebot",
|
| 140 |
+
"url": "https://www.cookiebot.com"
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"text": "Learn more about this provider",
|
| 144 |
+
"url": "https://privacy.microsoft.com/en-US/privacystatement"
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"text": "Email",
|
| 148 |
+
"url": "mailto:subject=A%20guide%20to%20the%20types%20of%20machine%20learning%20algorithms%20%7C%20SAS%20Ireland&body=https%3A%2F%2Fwww.sas.com%2Fen_ie%2Finsights%2Farticles%2Fanalytics%2Fmachine-learning-algorithms.html"
|
| 149 |
+
},
|
| 150 |
+
{
|
| 151 |
+
"text": "",
|
| 152 |
+
"url": "https://www.facebook.com/simplilearn"
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"text": "",
|
| 156 |
+
"url": "https://www.facebook.com/addeptoanalytics/"
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"text": "Open In App",
|
| 160 |
+
"url": "https://geeksforgeeksapp.page.link/?link=https://www.geeksforgeeks.org/ml-semi-supervised-learning/?type%3Darticle%26id%3D308122&apn=free.programming.programming&isi=1641848816&ibi=org.geeksforgeeks.GeeksforGeeksDev&efr=1"
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"text": "YouTube Channel",
|
| 164 |
+
"url": "https://www.youtube.com/c/sasusers"
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"text": "YouTube",
|
| 168 |
+
"url": "https://www.youtube.com/SASsoftware"
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"text": "Twitter",
|
| 172 |
+
"url": "https://twitter.com/intent/tweet?text=A%20guide%20to%20the%20types%20of%20machine%20learning%20algorithms%20%7C%20SAS%20Ireland&url=https%3A%2F%2Fwww.sas.com%2Fen_ie%2Finsights%2Farticles%2Fanalytics%2Fmachine-learning-algorithms.html"
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"text": "",
|
| 176 |
+
"url": "https://www.twitter.com/simplilearn"
|
| 177 |
+
},
|
| 178 |
+
{
|
| 179 |
+
"text": "https://arxiv.org/abs/1706.03741",
|
| 180 |
+
"url": "https://arxiv.org/abs/1706.03741"
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"text": "OpenCueOpen source render manager for visual effects and animation.",
|
| 184 |
+
"url": "https://www.opencue.io/docs/getting-started/"
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"text": "Learn more about this provider",
|
| 188 |
+
"url": "https://www.evergage.com/privacy-policy/"
|
| 189 |
+
},
|
| 190 |
+
{
|
| 191 |
+
"text": "Learn more about this provider",
|
| 192 |
+
"url": "https://legal.hubspot.com/privacy-policy?__hstc=198943664.258746a941bc90ce1f5f381e8fb22ac0.1741350864696.1741350864696.1741350864696.1&__hssc=198943664.1.1741350864696&__hsfp=1951743191"
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"text": "Learn more about this provider",
|
| 196 |
+
"url": "https://www.spotify.com/uk/legal/privacy-policy/"
|
| 197 |
+
},
|
| 198 |
+
{
|
| 199 |
+
"text": "",
|
| 200 |
+
"url": "https://www.instagram.com/geeks_for_geeks/"
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"text": "Learn more about this provider",
|
| 204 |
+
"url": "http://www.sitescout.com/privacy"
|
| 205 |
+
},
|
| 206 |
+
{
|
| 207 |
+
"text": "https://openai.com/research/learning-from-human-preferences",
|
| 208 |
+
"url": "https://openai.com/research/learning-from-human-preferences"
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"text": "Google Cloud Community",
|
| 212 |
+
"url": "https://www.googlecloudcommunity.com"
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"text": "",
|
| 216 |
+
"url": "https://pl.linkedin.com/company/addepto"
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
"text": "Learn more about this provider",
|
| 220 |
+
"url": "https://legal.hubspot.com/privacy-policy"
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"text": "",
|
| 224 |
+
"url": "https://twitter.com/geeksforgeeks"
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"text": "Learn more about this provider",
|
| 228 |
+
"url": "https://www.linkedin.com/legal/privacy-policy"
|
| 229 |
+
},
|
| 230 |
+
{
|
| 231 |
+
"text": "AppSheetNo-code development platform to build and extend applications.",
|
| 232 |
+
"url": "https://about.appsheet.com/home/"
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"text": "Check it out on Github!",
|
| 236 |
+
"url": "https://github.com/Addepto/contextcheck?utm_source=website&utm_medium=banner&utm_campaign=contextcheck"
|
| 237 |
+
},
|
| 238 |
+
{
|
| 239 |
+
"text": "",
|
| 240 |
+
"url": "https://twitter.com/addepto"
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"text": "Learn more about this provider",
|
| 244 |
+
"url": "https://www.home.neustar/privacy"
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"text": "",
|
| 248 |
+
"url": "https://www.elsevier.com/"
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"text": "",
|
| 252 |
+
"url": "https://www.linkedin.com/company/simplilearn"
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"text": "Press Corner",
|
| 256 |
+
"url": "https://www.googlecloudpresscorner.com"
|
| 257 |
+
},
|
| 258 |
+
{
|
| 259 |
+
"text": "LinkedIn",
|
| 260 |
+
"url": "https://www.linkedin.com/company/sas"
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"text": "Get the Android App",
|
| 264 |
+
"url": "https://play.google.com/store/apps/details?id=com.mobile.simplilearn"
|
| 265 |
+
},
|
| 266 |
+
{
|
| 267 |
+
"text": "",
|
| 268 |
+
"url": "https://www.instagram.com/turingcom"
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"text": "Learn more about this provider",
|
| 272 |
+
"url": "https://www.techtarget.com/privacy-policy-may25/?utm_source=cmp&utm_medium=banner&utm_campaign=consent&utm_term=privacy"
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"text": "",
|
| 276 |
+
"url": "https://in.linkedin.com/company/geeksforgeeks"
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"text": "https://openai.com/blog/chatgpt/",
|
| 280 |
+
"url": "https://openai.com/blog/chatgpt/"
|
| 281 |
+
},
|
| 282 |
+
{
|
| 283 |
+
"text": "Chrome EnterpriseChromeOS, Chrome Browser, and Chrome devices built for business.",
|
| 284 |
+
"url": "https://chromeenterprise.google"
|
| 285 |
+
},
|
| 286 |
+
{
|
| 287 |
+
"text": "Facebook",
|
| 288 |
+
"url": "https://www.facebook.com/SASsoftware"
|
| 289 |
+
},
|
| 290 |
+
{
|
| 291 |
+
"text": "Learn more about this provider",
|
| 292 |
+
"url": "https://www.adobe.com/privacy.html"
|
| 293 |
+
},
|
| 294 |
+
{
|
| 295 |
+
"text": "Sign In",
|
| 296 |
+
"url": "javascript:handleLogin('en_ie');"
|
| 297 |
+
},
|
| 298 |
+
{
|
| 299 |
+
"text": "",
|
| 300 |
+
"url": "https://www.youtube.com/geeksforgeeksvideos"
|
| 301 |
+
},
|
| 302 |
+
{
|
| 303 |
+
"text": "Learn more about this provider",
|
| 304 |
+
"url": "https://www.amazon.com/gp/help/customer/display.html/ref=footer_privacy?ie=UTF8&nodeId=468496"
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"text": "Privacy policy",
|
| 308 |
+
"url": "https://www.elsevier.com/legal/privacy-policy"
|
| 309 |
+
},
|
| 310 |
+
{
|
| 311 |
+
"text": "hi@addepto.com",
|
| 312 |
+
"url": "mailto:hi@addepto.com"
|
| 313 |
+
},
|
| 314 |
+
{
|
| 315 |
+
"text": "",
|
| 316 |
+
"url": "https://www.facebook.com/turingcom"
|
| 317 |
+
},
|
| 318 |
+
{
|
| 319 |
+
"text": "Learn more about this provider",
|
| 320 |
+
"url": "https://documents.marketo.com/legal/cookies/"
|
| 321 |
+
},
|
| 322 |
+
{
|
| 323 |
+
"text": "Terms and conditions",
|
| 324 |
+
"url": "https://www.elsevier.com/legal/elsevier-website-terms-and-conditions"
|
| 325 |
+
},
|
| 326 |
+
{
|
| 327 |
+
"text": "https://openai.com/blog/instruction-following/",
|
| 328 |
+
"url": "https://openai.com/blog/instruction-following/"
|
| 329 |
+
},
|
| 330 |
+
{
|
| 331 |
+
"text": "",
|
| 332 |
+
"url": "https://t.me/simplilearnupdates"
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"text": "",
|
| 336 |
+
"url": "https://www.cookiebot.com/en/what-is-behind-powered-by-cookiebot/"
|
| 337 |
+
},
|
| 338 |
+
{
|
| 339 |
+
"text": "LinkedIn",
|
| 340 |
+
"url": "https://www.linkedin.com/company/deepsense-ai/"
|
| 341 |
+
},
|
| 342 |
+
{
|
| 343 |
+
"text": "Learn more about this provider",
|
| 344 |
+
"url": "https://www.openx.com/legal/privacy-policy/"
|
| 345 |
+
},
|
| 346 |
+
{
|
| 347 |
+
"text": "",
|
| 348 |
+
"url": "https://web.telegram.org/#/im?p=@simplilearnupdates"
|
| 349 |
+
},
|
| 350 |
+
{
|
| 351 |
+
"text": "ContextClue",
|
| 352 |
+
"url": "https://context-clue.com/"
|
| 353 |
+
},
|
| 354 |
+
{
|
| 355 |
+
"text": "Facebook",
|
| 356 |
+
"url": "https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Faddepto.com%2Fblog%2Fdeep-learning-architecture%2F"
|
| 357 |
+
},
|
| 358 |
+
{
|
| 359 |
+
"text": "Learn more about this provider",
|
| 360 |
+
"url": "https://soundcloud.com/pages/privacy"
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"text": "About Google",
|
| 364 |
+
"url": "https://about.google"
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"text": "Google Cloud on YouTube",
|
| 368 |
+
"url": "https://www.youtube.com/googlecloud"
|
| 369 |
+
},
|
| 370 |
+
{
|
| 371 |
+
"text": "About ScienceDirect",
|
| 372 |
+
"url": "https://www.elsevier.com/solutions/sciencedirect"
|
| 373 |
+
},
|
| 374 |
+
{
|
| 375 |
+
"text": "Follow on X",
|
| 376 |
+
"url": "https://x.com/googlecloud"
|
| 377 |
+
},
|
| 378 |
+
{
|
| 379 |
+
"text": "Shopping cart",
|
| 380 |
+
"url": "https://sd-cart.elsevier.com/"
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"text": "",
|
| 384 |
+
"url": "https://www.instagram.com/simplilearn_official/"
|
| 385 |
+
},
|
| 386 |
+
{
|
| 387 |
+
"text": "Learn more about this provider",
|
| 388 |
+
"url": "https://twitter.com/en/privacy"
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"text": "Learn more about this provider",
|
| 392 |
+
"url": "https://www.cookiebot.com/goto/privacy-policy/"
|
| 393 |
+
},
|
| 394 |
+
{
|
| 395 |
+
"text": "",
|
| 396 |
+
"url": "https://www.relx.com/"
|
| 397 |
+
},
|
| 398 |
+
{
|
| 399 |
+
"text": "",
|
| 400 |
+
"url": "https://x.com/deepsense_ai"
|
| 401 |
+
},
|
| 402 |
+
{
|
| 403 |
+
"text": "Add to Mendeley",
|
| 404 |
+
"url": "https://www.mendeley.com/reference-management/web-importer?utm_source=science_direct&utm_campaign=web_importer_cross_sell"
|
| 405 |
+
},
|
| 406 |
+
{
|
| 407 |
+
"text": "KnativeComponents to create Kubernetes-native cloud-based software.",
|
| 408 |
+
"url": "https://knative.dev/docs/"
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"text": "Twitter",
|
| 412 |
+
"url": "https://twitter.com/SASsoftware"
|
| 413 |
+
},
|
| 414 |
+
{
|
| 415 |
+
"text": "",
|
| 416 |
+
"url": "https://www.youtube.com/user/Simplilearn"
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"text": "",
|
| 420 |
+
"url": "https://www.youtube.com/channel/UCLfxdtB3H0JdnniBnd4DoNA"
|
| 421 |
+
},
|
| 422 |
+
{
|
| 423 |
+
"text": "Facebook",
|
| 424 |
+
"url": "https://www.facebook.com/sharer/sharer.php?u=https://deepsense.ai/blog/using-reinforcement-learning-to-improve-large-language-models/&nb=1"
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"text": "Learn more about this provider",
|
| 428 |
+
"url": "https://www.hotjar.com/legal/policies/privacy/"
|
| 429 |
+
},
|
| 430 |
+
{
|
| 431 |
+
"text": "LinkedIn",
|
| 432 |
+
"url": "https://www.linkedin.com/sharing/share-offsite/?url=https://deepsense.ai/blog/using-reinforcement-learning-to-improve-large-language-models/&nb=1"
|
| 433 |
+
},
|
| 434 |
+
{
|
| 435 |
+
"text": "Twitter",
|
| 436 |
+
"url": "https://twitter.com/intent/tweet?text=Deep+Learning+Architecture+Examples&url=https%3A%2F%2Faddepto.com%2Fblog%2Fdeep-learning-architecture%2F"
|
| 437 |
+
},
|
| 438 |
+
{
|
| 439 |
+
"text": "Merchandise",
|
| 440 |
+
"url": "https://www.sascompanystore.com/"
|
| 441 |
+
},
|
| 442 |
+
{
|
| 443 |
+
"text": "Chrome Enterprise PremiumGet secure enterprise browsing with extensive endpoint visibility.",
|
| 444 |
+
"url": "https://chromeenterprise.google/products/chrome-enterprise-premium/"
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"text": "Careers",
|
| 448 |
+
"url": "https://geeksforgeeks.zohorecruit.in/careers?ref=footer"
|
| 449 |
+
},
|
| 450 |
+
{
|
| 451 |
+
"text": "Subscribe to newsletter",
|
| 452 |
+
"url": "https://addepto.us20.list-manage.com/subscribe?u=06ea2fe321b296590739d716f&id=0b963312a3"
|
| 453 |
+
},
|
| 454 |
+
{
|
| 455 |
+
"text": "LinkedIn",
|
| 456 |
+
"url": "https://www.linkedin.com/shareArticle?mini=true&url=https%3A%2F%2Faddepto.com%2Fblog%2Fdeep-learning-architecture%2F&title=Deep+Learning+Architecture+Examples"
|
| 457 |
+
},
|
| 458 |
+
{
|
| 459 |
+
"text": "",
|
| 460 |
+
"url": "https://twitter.com/turingcom"
|
| 461 |
+
},
|
| 462 |
+
{
|
| 463 |
+
"text": "Medium",
|
| 464 |
+
"url": "https://medium.com/deepsense-ai"
|
| 465 |
+
},
|
| 466 |
+
{
|
| 467 |
+
"text": "Google Cloud Tech on YouTube",
|
| 468 |
+
"url": "https://www.youtube.com/googlecloudplatform"
|
| 469 |
+
},
|
| 470 |
+
{
|
| 471 |
+
"text": "X",
|
| 472 |
+
"url": "https://twitter.com/intent/tweet?text=https://deepsense.ai/blog/using-reinforcement-learning-to-improve-large-language-models/&nb=1"
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"text": "https://huggingface.co/blog/rlhf",
|
| 476 |
+
"url": "https://huggingface.co/blog/rlhf"
|
| 477 |
+
},
|
| 478 |
+
{
|
| 479 |
+
"text": "https://wandb.ai/ayush-thakur/RLHF/reports/Understanding-Reinforcement-Learning-from-Human-Feedback-RLHF-Part-1\u2013VmlldzoyODk5MTIx",
|
| 480 |
+
"url": "https://wandb.ai/ayush-thakur/RLHF/reports/Understanding-Reinforcement-Learning-from-Human-Feedback-RLHF-Part-1--VmlldzoyODk5MTIx"
|
| 481 |
+
},
|
| 482 |
+
{
|
| 483 |
+
"text": "X",
|
| 484 |
+
"url": "https://twitter.com/deepsense_ai"
|
| 485 |
+
},
|
| 486 |
+
{
|
| 487 |
+
"text": "Facebook",
|
| 488 |
+
"url": "http://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.sas.com%2Fen_ie%2Finsights%2Farticles%2Fanalytics%2Fmachine-learning-algorithms.html"
|
| 489 |
+
},
|
| 490 |
+
{
|
| 491 |
+
"text": "",
|
| 492 |
+
"url": "mailto:?subject=Semi-Supervised%20Learning%20in%20Artificial%20Intelligence&body=Hello,%0A %0A I found this at datarobot.com and thought you might be interested. Take a look here: https%3A%2F%2Fwww.datarobot.com%2Fblog%2Fsemi-supervised-learning%2F%0A %0A Thank you."
|
| 493 |
+
},
|
| 494 |
+
{
|
| 495 |
+
"text": "",
|
| 496 |
+
"url": "https://www.linkedin.com/shareArticle?mini=true&url=https%3A%2F%2Fwww.datarobot.com%2Fblog%2Fsemi-supervised-learning%2F"
|
| 497 |
+
},
|
| 498 |
+
{
|
| 499 |
+
"text": "Facebook",
|
| 500 |
+
"url": "https://www.facebook.com/deepsenseai"
|
| 501 |
+
},
|
| 502 |
+
{
|
| 503 |
+
"text": "Learn more about this provider",
|
| 504 |
+
"url": "https://business.safety.google/privacy/"
|
| 505 |
+
},
|
| 506 |
+
{
|
| 507 |
+
"text": "LinkedIn",
|
| 508 |
+
"url": "https://www.linkedin.com/sharing/share-offsite/?url=https%3A%2F%2Fwww.sas.com%2Fen_ie%2Finsights%2Farticles%2Fanalytics%2Fmachine-learning-algorithms.html"
|
| 509 |
+
},
|
| 510 |
+
{
|
| 511 |
+
"text": "",
|
| 512 |
+
"url": "https://www.facebook.com/sharer/sharer.php?u=https%3A%2F%2Fwww.datarobot.com%2Fblog%2Fsemi-supervised-learning%2F"
|
| 513 |
+
},
|
| 514 |
+
{
|
| 515 |
+
"text": "video",
|
| 516 |
+
"url": "https://www.youtube.com/watch?v=oC7Cw3fu3gU"
|
| 517 |
+
},
|
| 518 |
+
{
|
| 519 |
+
"text": "Search Jobs",
|
| 520 |
+
"url": "https://globalcareers-sas.icims.com/jobs/intro"
|
| 521 |
+
},
|
| 522 |
+
{
|
| 523 |
+
"text": "",
|
| 524 |
+
"url": "https://geeksforgeeksapp.page.link/gfg-app"
|
| 525 |
+
},
|
| 526 |
+
{
|
| 527 |
+
"text": "",
|
| 528 |
+
"url": "https://github.com/Addepto/contextcheck"
|
| 529 |
+
},
|
| 530 |
+
{
|
| 531 |
+
"text": "Advertise",
|
| 532 |
+
"url": "https://www.elsmediakits.com"
|
| 533 |
+
},
|
| 534 |
+
{
|
| 535 |
+
"text": "Evaluate Your RAG-Powered Chatbots",
|
| 536 |
+
"url": "https://context-clue.com/contextcheck/"
|
| 537 |
+
},
|
| 538 |
+
{
|
| 539 |
+
"text": "Learn more about this provider",
|
| 540 |
+
"url": "https://www.facebook.com/policy.php/"
|
| 541 |
+
},
|
| 542 |
+
{
|
| 543 |
+
"text": "",
|
| 544 |
+
"url": "https://twitter.com/intent/tweet?url=https%3A%2F%2Fwww.datarobot.com%2Fblog%2Fsemi-supervised-learning%2F&text="
|
| 545 |
+
},
|
| 546 |
+
{
|
| 547 |
+
"text": "YouTube",
|
| 548 |
+
"url": "https://www.youtube.com/@deepsenseai"
|
| 549 |
+
},
|
| 550 |
+
{
|
| 551 |
+
"text": "Follow",
|
| 552 |
+
"url": "https://news.google.com/publications/CAAqBwgKMLTrzwsw44bnAw?hl=en-IN&gl=IN&ceid=IN%3Aen"
|
| 553 |
+
},
|
| 554 |
+
{
|
| 555 |
+
"text": "",
|
| 556 |
+
"url": "https://www.facebook.com/geeksforgeeks.org/"
|
| 557 |
+
},
|
| 558 |
+
{
|
| 559 |
+
"text": "Learn more about this provider",
|
| 560 |
+
"url": "https://vwo.com/privacy-policy/"
|
| 561 |
+
},
|
| 562 |
+
{
|
| 563 |
+
"text": "Learn more about this provider",
|
| 564 |
+
"url": "https://www.zoho.com/privacy.html"
|
| 565 |
+
},
|
| 566 |
+
{
|
| 567 |
+
"text": "Get the iOS App",
|
| 568 |
+
"url": "https://apps.apple.com/app/simplilearn/id963042747?ls=1"
|
| 569 |
+
},
|
| 570 |
+
{
|
| 571 |
+
"text": "",
|
| 572 |
+
"url": "https://www.linkedin.com/company/turingcom"
|
| 573 |
+
}
|
| 574 |
+
],
|
| 575 |
+
"references": []
|
| 576 |
+
},
|
| 577 |
+
"research_tree": {
|
| 578 |
+
"query": "what are the different types of deep learning models",
|
| 579 |
+
"depth": 0,
|
| 580 |
+
"children": [
|
| 581 |
+
{
|
| 582 |
+
"query": "deep learning model architectures comparison",
|
| 583 |
+
"depth": 1,
|
| 584 |
+
"children": [
|
| 585 |
+
{
|
| 586 |
+
"query": "Transformer models in NLP: advancements and applications",
|
| 587 |
+
"depth": 2,
|
| 588 |
+
"children": []
|
| 589 |
+
},
|
| 590 |
+
{
|
| 591 |
+
"query": "Graph Neural Networks (GNNs) for social network analysis",
|
| 592 |
+
"depth": 2,
|
| 593 |
+
"children": []
|
| 594 |
+
},
|
| 595 |
+
{
|
| 596 |
+
"query": "Deep Reinforcement Learning with DQNs: challenges and solutions",
|
| 597 |
+
"depth": 2,
|
| 598 |
+
"children": []
|
| 599 |
+
}
|
| 600 |
+
]
|
| 601 |
+
},
|
| 602 |
+
{
|
| 603 |
+
"query": "applications of semi-supervised learning in deep learning",
|
| 604 |
+
"depth": 1,
|
| 605 |
+
"children": [
|
| 606 |
+
{
|
| 607 |
+
"query": "Transformer models advancements in NLP and computer vision",
|
| 608 |
+
"depth": 2,
|
| 609 |
+
"children": []
|
| 610 |
+
},
|
| 611 |
+
{
|
| 612 |
+
"query": "Applications of Graph Neural Networks (GNNs) in various domains",
|
| 613 |
+
"depth": 2,
|
| 614 |
+
"children": []
|
| 615 |
+
},
|
| 616 |
+
{
|
| 617 |
+
"query": "Semi-supervised learning techniques for handling imbalanced datasets",
|
| 618 |
+
"depth": 2,
|
| 619 |
+
"children": []
|
| 620 |
+
}
|
| 621 |
+
]
|
| 622 |
+
},
|
| 623 |
+
{
|
| 624 |
+
"query": "advanced reinforcement learning algorithms and implementations",
|
| 625 |
+
"depth": 1,
|
| 626 |
"children": [
|
| 627 |
+
{
|
| 628 |
+
"query": "deep learning model architectures comparison RNN LSTM GRU CNN Transformer GAN",
|
| 629 |
+
"depth": 2,
|
| 630 |
+
"children": []
|
| 631 |
+
},
|
| 632 |
+
{
|
| 633 |
+
"query": "reinforcement learning for large language models RLHF PPO algorithm challenges",
|
| 634 |
+
"depth": 2,
|
| 635 |
+
"children": []
|
| 636 |
+
},
|
| 637 |
+
{
|
| 638 |
+
"query": "semi-supervised learning applications text classification clustering examples",
|
| 639 |
+
"depth": 2,
|
| 640 |
+
"children": []
|
| 641 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
]
|
| 643 |
+
}
|
| 644 |
+
]
|
| 645 |
+
},
|
| 646 |
+
"metadata": {
|
| 647 |
+
"total_queries": 12,
|
| 648 |
+
"total_sources": 10,
|
| 649 |
+
"max_depth_reached": 2,
|
| 650 |
+
"total_tokens": 573199
|
| 651 |
+
}
|
| 652 |
}
|
backend/research_node.py
CHANGED
|
@@ -8,9 +8,6 @@ class ResearchNode:
|
|
| 8 |
self.depth = depth
|
| 9 |
self.children: List[ResearchNode] = []
|
| 10 |
self.data: List[Dict[str, Any]] = []
|
| 11 |
-
self.explored = False
|
| 12 |
-
self.importance_score = 0.0
|
| 13 |
-
self.timestamp = datetime.now()
|
| 14 |
|
| 15 |
def add_child(self, query: str) -> 'ResearchNode':
|
| 16 |
child = ResearchNode(query, parent=self, depth=self.depth + 1)
|
|
@@ -24,7 +21,7 @@ class ResearchNode:
|
|
| 24 |
current = current.parent
|
| 25 |
path.append(current.query)
|
| 26 |
return list(reversed(path))
|
| 27 |
-
|
| 28 |
def max_depth(self) -> int:
|
| 29 |
if not self.children:
|
| 30 |
return self.depth
|
|
@@ -34,3 +31,9 @@ class ResearchNode:
|
|
| 34 |
if not self.children:
|
| 35 |
return 0
|
| 36 |
return len(self.children) + sum([child.total_children() for child in self.children])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
self.depth = depth
|
| 9 |
self.children: List[ResearchNode] = []
|
| 10 |
self.data: List[Dict[str, Any]] = []
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def add_child(self, query: str) -> 'ResearchNode':
|
| 13 |
child = ResearchNode(query, parent=self, depth=self.depth + 1)
|
|
|
|
| 21 |
current = current.parent
|
| 22 |
path.append(current.query)
|
| 23 |
return list(reversed(path))
|
| 24 |
+
|
| 25 |
def max_depth(self) -> int:
|
| 26 |
if not self.children:
|
| 27 |
return self.depth
|
|
|
|
| 31 |
if not self.children:
|
| 32 |
return 0
|
| 33 |
return len(self.children) + sum([child.total_children() for child in self.children])
|
| 34 |
+
|
| 35 |
+
def get_all_data(self) -> List[Dict[str, Any]]:
    """Collect the data records of this node and of all its descendants.

    Records are gathered in pre-order: this node's own ``data`` first,
    followed by each child's subtree in ``children`` order.

    Returns:
        A new list. ``self.data`` is left untouched, so calling this method
        repeatedly yields the same result every time.
    """
    # Copy before extending: binding the result to ``self.data`` itself would
    # append every descendant's records onto this node's own list, and each
    # further call would pile up more duplicates.
    collected = list(self.data)
    for child in self.children:
        collected.extend(child.get_all_data())
    return collected
|
backend/scraper.py
CHANGED
|
@@ -8,6 +8,7 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode
|
|
| 8 |
import newspaper
|
| 9 |
from newspaper import Article
|
| 10 |
import requests
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
class WebScraper:
|
|
@@ -170,6 +171,7 @@ class CrawlForAIScraper:
|
|
| 170 |
async def start(self):
|
| 171 |
if not self._is_started:
|
| 172 |
await self.crawler.start()
|
|
|
|
| 173 |
self._is_started = True
|
| 174 |
|
| 175 |
async def close(self):
|
|
@@ -178,12 +180,14 @@ class CrawlForAIScraper:
|
|
| 178 |
self._is_started = False
|
| 179 |
|
| 180 |
async def search_and_scrape(self, query: str, num_sites: int = 3) -> List[Dict[str, Any]]:
|
| 181 |
-
|
| 182 |
-
await self.start()
|
| 183 |
self.logger.info(f"Starting search for: {query}")
|
| 184 |
-
|
|
|
|
|
|
|
| 185 |
self.logger.info(f"Found {len(search_results)} search results")
|
| 186 |
|
|
|
|
| 187 |
scraped_data = []
|
| 188 |
for idx, url in enumerate(search_results):
|
| 189 |
try:
|
|
@@ -196,59 +200,110 @@ class CrawlForAIScraper:
|
|
| 196 |
self.logger.error(f"Error scraping {url}: {str(e)}")
|
| 197 |
continue
|
| 198 |
|
| 199 |
-
await self.crawler.close()
|
| 200 |
self.logger.info(f"Completed scraping {len(scraped_data)} sites")
|
| 201 |
return scraped_data
|
| 202 |
|
| 203 |
async def _google_search(self, query: str, num_results: int) -> List[str]:
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
async def _scrape_page(self, url: str) -> Dict[str, Any]:
|
| 207 |
-
|
| 208 |
-
await self.start()
|
| 209 |
|
| 210 |
try:
|
| 211 |
# Run the crawler on a URL
|
| 212 |
-
result = await self.crawler.arun(url=url, screenshot=False, cache_mode=CacheMode.BYPASS)
|
| 213 |
soup = BeautifulSoup(result.html, "html.parser")
|
| 214 |
data = {
|
| 215 |
"url": url,
|
| 216 |
"text": result.markdown,
|
| 217 |
-
"images": self._extract_images(soup),
|
| 218 |
-
"videos":
|
| 219 |
-
"links": result.links,
|
| 220 |
}
|
| 221 |
|
| 222 |
return data
|
| 223 |
|
| 224 |
except Exception as e:
|
| 225 |
-
|
| 226 |
-
raise e
|
| 227 |
return {}
|
| 228 |
|
| 229 |
-
def
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
return images
|
| 236 |
|
| 237 |
def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
|
| 247 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
async def main():
|
| 249 |
scraper = CrawlForAIScraper()
|
| 250 |
await scraper.start()
|
| 251 |
-
data = await scraper.
|
| 252 |
await scraper.close()
|
|
|
|
|
|
|
| 253 |
print(json.dumps(data, indent=2))
|
| 254 |
asyncio.run(main())
|
|
|
|
| 8 |
import newspaper
|
| 9 |
from newspaper import Article
|
| 10 |
import requests
|
| 11 |
+
import time
|
| 12 |
|
| 13 |
|
| 14 |
class WebScraper:
|
|
|
|
| 171 |
async def start(self):
    """Start the underlying crawler once; later calls are no-ops.

    After the crawler has been started, the coroutine waits one second
    before marking the scraper as started (presumably to let the browser
    settle -- confirm with the author).
    """
    if self._is_started:
        return
    await self.crawler.start()
    # asyncio.sleep instead of time.sleep: a blocking sleep inside a
    # coroutine freezes the entire event loop for its duration.
    await asyncio.sleep(1)
    self._is_started = True
|
| 176 |
|
| 177 |
async def close(self):
|
|
|
|
| 180 |
self._is_started = False
|
| 181 |
|
| 182 |
async def search_and_scrape(self, query: str, num_sites: int = 3) -> List[Dict[str, Any]]:
|
| 183 |
+
await self.start()
|
|
|
|
| 184 |
self.logger.info(f"Starting search for: {query}")
|
| 185 |
+
|
| 186 |
+
# Perform a Google search to get a list of webpages
|
| 187 |
+
search_results = await self._google_search(query, num_sites)
|
| 188 |
self.logger.info(f"Found {len(search_results)} search results")
|
| 189 |
|
| 190 |
+
# Scrape each webpage
|
| 191 |
scraped_data = []
|
| 192 |
for idx, url in enumerate(search_results):
|
| 193 |
try:
|
|
|
|
| 200 |
self.logger.error(f"Error scraping {url}: {str(e)}")
|
| 201 |
continue
|
| 202 |
|
|
|
|
| 203 |
self.logger.info(f"Completed scraping {len(scraped_data)} sites")
|
| 204 |
return scraped_data
|
| 205 |
|
| 206 |
async def _google_search(self, query: str, num_results: int) -> List[str]:
|
| 207 |
+
self.logger.info("Performing Google search...")
|
| 208 |
+
try:
|
| 209 |
+
encoded_query = quote_plus(query)
|
| 210 |
+
search_uri = f"https://www.google.com/search?q={encoded_query}"
|
| 211 |
+
|
| 212 |
+
result = await self.crawler.arun(url=search_uri, screenshot=False, cache_mode=CacheMode.BYPASS, delay_before_return_html=2, page_timeout=25000, scan_full_page=True)
|
| 213 |
+
|
| 214 |
+
soup = BeautifulSoup(result.html, "html.parser")
|
| 215 |
+
search_results = []
|
| 216 |
+
|
| 217 |
+
for link in list(soup.select("div > span > a"))[2:]:
|
| 218 |
+
url = link.get("href").replace(" ", "").replace("\n", "").strip()
|
| 219 |
+
if not url.startswith(("http://", "https://")):
|
| 220 |
+
url = "https://" + url
|
| 221 |
+
search_results.append(url)
|
| 222 |
+
if len(search_results) >= num_results:
|
| 223 |
+
break
|
| 224 |
+
|
| 225 |
+
self.logger.info(f"Found {len(search_results)} URLs")
|
| 226 |
+
return search_results
|
| 227 |
+
|
| 228 |
+
except requests.exceptions.RequestException as e:
|
| 229 |
+
self.logger.error(f"Google search error: {str(e)}")
|
| 230 |
+
return []
|
| 231 |
+
except Exception as e:
|
| 232 |
+
self.logger.error(f"Google search error: {str(e)}")
|
| 233 |
+
return []
|
| 234 |
|
| 235 |
async def _scrape_page(self, url: str) -> Dict[str, Any]:
|
| 236 |
+
await self.start()
|
|
|
|
| 237 |
|
| 238 |
try:
|
| 239 |
# Run the crawler on a URL
|
| 240 |
+
result = await self.crawler.arun(url=url, screenshot=False, cache_mode=CacheMode.BYPASS, delay_before_return_html=2, page_timeout=25000, scan_full_page=True)
|
| 241 |
soup = BeautifulSoup(result.html, "html.parser")
|
| 242 |
data = {
|
| 243 |
"url": url,
|
| 244 |
"text": result.markdown,
|
| 245 |
+
"images": self._extract_images(soup, result.url),
|
| 246 |
+
"videos": self._extract_videos(soup),
|
| 247 |
+
"links": result.links["external"],
|
| 248 |
}
|
| 249 |
|
| 250 |
return data
|
| 251 |
|
| 252 |
except Exception as e:
|
| 253 |
+
self.logger.error(f"Scraping error for {url}: {str(e)}")
|
| 254 |
+
# raise e
|
| 255 |
return {}
|
| 256 |
|
| 257 |
+
def _extract_images(self, soup: BeautifulSoup, url: str) -> List[str]:
|
| 258 |
+
# Extract images with width and height greater than 300 pixels
|
| 259 |
+
images = []
|
| 260 |
+
for img in soup.find_all('img'):
|
| 261 |
+
if 'src' in img.attrs:
|
| 262 |
+
src = img['src']
|
| 263 |
+
# remove px or any characters from width and height
|
| 264 |
+
width = int(''.join(filter(str.isdigit, img.get('width', '0'))))
|
| 265 |
+
height = int(''.join(filter(str.isdigit, img.get('height', '0'))))
|
| 266 |
+
if width > 300 and height > 300 and 'pixel' not in src and 'icon' not in src:
|
| 267 |
+
images.append((src, width, height))
|
| 268 |
+
images = sorted(images, key=lambda img: -1 * (img[1] * img[2]))
|
| 269 |
+
images = [img[0] for img in images]
|
| 270 |
+
|
| 271 |
+
# Add base URL to relative URLs
|
| 272 |
+
base_url = '/'.join(url.split('/')[:3])
|
| 273 |
+
images = [img if img.startswith('http') else base_url + img for img in images]
|
| 274 |
return images
|
| 275 |
|
| 276 |
def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
|
| 277 |
+
# Extract videos from iframes and video tags
|
| 278 |
+
videos = []
|
| 279 |
+
nodes = list(soup.find_all('iframe')) + list(soup.find_all('video')) + list(soup.find_all('a'))
|
| 280 |
+
for node in nodes:
|
| 281 |
+
if node.name == 'iframe':
|
| 282 |
+
src = node.get('src', '')
|
| 283 |
+
if 'youtube.com' in src or 'youtu.be' in src:
|
| 284 |
+
videos.append(src)
|
| 285 |
+
elif node.name == 'video':
|
| 286 |
+
src = node.get('src', '')
|
| 287 |
+
if 'youtube.com' in src or 'youtu.be' in src:
|
| 288 |
+
videos.append(src)
|
| 289 |
+
elif node.name == 'a':
|
| 290 |
+
href = node.get('href', '')
|
| 291 |
+
if 'youtube.com' in href or 'youtu.be' in href:
|
| 292 |
+
videos.append(href)
|
| 293 |
+
return videos
|
| 294 |
|
| 295 |
|
| 296 |
if __name__ == "__main__":
    # Manual smoke test: run one search, then dump the scraped data to
    # output.json and to stdout.
    # NOTE(review): the previous version read a URL from sys.argv into a
    # variable that was never used; that dead code has been dropped.

    async def main():
        scraper = CrawlForAIScraper()
        await scraper.start()
        try:
            data = await scraper.search_and_scrape("what is ai")
        finally:
            # Always release the crawler, even if scraping raises.
            await scraper.close()

        # Serialize once and reuse the text for both outputs.
        serialized = json.dumps(data, indent=2)
        with open("output.json", "w", encoding="utf-8") as f:
            f.write(serialized)
        print(serialized)

    asyncio.run(main())
|