Agnuxo commited on
Commit
aa5a2db
·
verified ·
1 Parent(s): 433143b

Upload research/web_scanner.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. research/web_scanner.py +126 -0
research/web_scanner.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web Research Scanner
3
+ =====================
4
+ Scans free sources for related research and collaboration opportunities.
5
+ """
6
+ import json
7
+ import logging
8
+ import urllib.request
9
+ import urllib.parse
10
+ from typing import Optional
11
+ from datetime import datetime
12
+
13
+ logger = logging.getLogger("openclaw.webscan")
14
+
15
+
16
class WebResearchScanner:
    """Scan free public APIs (ArXiv, Semantic Scholar, Hugging Face) for
    research updates and potential collaborators.

    All methods are best-effort: network or parsing failures are caught,
    logged as warnings, and an empty (or partial) list is returned rather
    than an exception raised.
    """

    def search_arxiv_related(self, topics: list[str], max_results: int = 10) -> list[dict]:
        """Search ArXiv for recent papers related to the given topics.

        Args:
            topics: Research topic phrases; only the first three are queried
                to stay under ArXiv's rate limits.
            max_results: Maximum number of papers requested per topic.

        Returns:
            List of dicts with keys ``title``, ``authors`` (at most 3),
            ``url`` (abstract page, "" if not found) and ``topic``.
        """
        # Hoisted: the original imported ET inside the `with` block, re-running
        # the import machinery on every topic iteration.
        import xml.etree.ElementTree as ET

        papers: list[dict] = []
        ns = {"atom": "http://www.w3.org/2005/Atom"}

        for topic in topics[:3]:  # limit to avoid rate limits
            try:
                query = urllib.parse.urlencode({
                    "search_query": f'all:"{topic}"',
                    "start": 0,
                    "max_results": max_results,
                    "sortBy": "submittedDate",
                    "sortOrder": "descending",
                })
                # HTTPS endpoint directly (the original used plain http://,
                # which depends on a server-side redirect).
                url = f"https://export.arxiv.org/api/query?{query}"
                req = urllib.request.Request(url, headers={"User-Agent": "OpenCLAW-Agent/1.0"})

                with urllib.request.urlopen(req, timeout=30) as resp:
                    root = ET.fromstring(resp.read().decode())

                for entry in root.findall("atom:entry", ns):
                    # findtext() tolerates a missing child element; the original
                    # `.find(...).text` raised AttributeError on None.
                    title = entry.findtext(
                        "atom:title", default="", namespaces=ns
                    ).strip().replace("\n", " ")
                    authors = [
                        a.findtext("atom:name", default="", namespaces=ns)
                        for a in entry.findall("atom:author", ns)
                    ]

                    # Keep the last link whose href contains "abs" (the
                    # abstract page), matching the original behaviour.
                    paper_url = ""
                    for link in entry.findall("atom:link", ns):
                        if "abs" in link.get("href", ""):
                            paper_url = link.get("href")

                    papers.append({
                        "title": title,
                        "authors": authors[:3],
                        "url": paper_url,
                        "topic": topic,
                    })

            except Exception as e:
                # Best-effort: one failed topic must not abort the others.
                logger.warning(f"ArXiv search for '{topic}' failed: {e}")

        return papers

    def search_semantic_scholar(self, query: str, limit: int = 5) -> list[dict]:
        """Search the Semantic Scholar Graph API (free, no key needed).

        Args:
            query: Free-text search query.
            limit: Maximum number of results to request.

        Returns:
            List of dicts with keys ``title``, ``authors`` (at most 3 names),
            ``url`` and ``year`` (may be None). Empty list on any failure.
        """
        papers: list[dict] = []
        try:
            encoded = urllib.parse.quote(query)
            url = f"https://api.semanticscholar.org/graph/v1/paper/search?query={encoded}&limit={limit}&fields=title,authors,url,year"

            req = urllib.request.Request(url, headers={"User-Agent": "OpenCLAW-Agent/1.0"})
            with urllib.request.urlopen(req, timeout=15) as resp:
                data = json.loads(resp.read().decode())

            for p in data.get("data", []):
                papers.append({
                    "title": p.get("title", ""),
                    "authors": [a.get("name", "") for a in p.get("authors", [])[:3]],
                    "url": p.get("url", ""),
                    "year": p.get("year"),
                })
        except Exception as e:
            logger.warning(f"Semantic Scholar search failed: {e}")

        return papers

    def search_hf_models(self, query: str, limit: int = 5) -> list[dict]:
        """Search the Hugging Face Hub for relevant models, by downloads.

        Args:
            query: Free-text model search query.
            limit: Maximum number of models to request.

        Returns:
            List of dicts with keys ``id``, ``downloads``, ``likes`` and
            ``tags`` (at most 5). Empty list on any failure.
        """
        models: list[dict] = []
        try:
            encoded = urllib.parse.quote(query)
            url = f"https://huggingface.co/api/models?search={encoded}&limit={limit}&sort=downloads&direction=-1"

            req = urllib.request.Request(url, headers={"User-Agent": "OpenCLAW-Agent/1.0"})
            with urllib.request.urlopen(req, timeout=15) as resp:
                data = json.loads(resp.read().decode())

            for m in data:
                models.append({
                    "id": m.get("modelId", ""),
                    "downloads": m.get("downloads", 0),
                    "likes": m.get("likes", 0),
                    "tags": m.get("tags", [])[:5],
                })
        except Exception as e:
            logger.warning(f"HF model search failed: {e}")

        return models

    def find_potential_collaborators(self, topics: list[str]) -> list[dict]:
        """Find researchers working on similar topics via Semantic Scholar.

        Args:
            topics: Research topic phrases; only the first three are queried.

        Returns:
            At most 20 dicts with keys ``name``, ``paper`` (title the author
            appeared on) and ``topic``, deduplicated by author name.
        """
        collaborators: list[dict] = []
        seen_names: set[str] = set()

        for topic in topics[:3]:
            papers = self.search_semantic_scholar(topic, limit=5)
            for p in papers:
                for author in p.get("authors", []):
                    # Tolerate either plain-string or dict-shaped author
                    # entries, as upstream methods may return either form.
                    name = author if isinstance(author, str) else author.get("name", "")
                    # "Angulo" exclusion: presumably filters out the repo
                    # author themselves — NOTE(review): confirm intent.
                    if name and name not in seen_names and "Angulo" not in name:
                        seen_names.add(name)
                        collaborators.append({
                            "name": name,
                            "paper": p.get("title", ""),
                            "topic": topic,
                        })

        return collaborators[:20]