Musombi committed on
Commit
078de65
·
1 Parent(s): d46301b

Update reasoning/scraper.py

Browse files
Files changed (1) hide show
  1. reasoning/scraper.py +38 -13
reasoning/scraper.py CHANGED
@@ -1,10 +1,11 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
  import re
 
4
  from typing import List, Dict
5
 
6
  HEADERS = {
7
- "User-Agent": "Mozilla/5.0 (AI Knowledge Collector)"
8
  }
9
 
10
  SOURCES = {
@@ -14,19 +15,25 @@ SOURCES = {
14
  "medium": "https://medium.com/search?q={query}"
15
  }
16
 
 
17
  def clean_text(text: str) -> str:
18
  text = re.sub(r'\s+', ' ', text)
19
  text = re.sub(r'http\S+', '', text)
20
- return text.strip()
 
 
21
 
 
22
 
23
- def extract_sentences(text: str, max_len=200) -> List[str]:
24
  sentences = re.split(r'[.!?]', text)
 
25
  cleaned = []
26
 
27
  for s in sentences:
 
28
  s = clean_text(s)
29
- if len(s) > 20 and len(s) < max_len:
 
30
  cleaned.append(s)
31
 
32
  return cleaned
@@ -35,7 +42,12 @@ def extract_sentences(text: str, max_len=200) -> List[str]:
35
  def scrape_page(url: str) -> str:
36
 
37
  try:
38
- r = requests.get(url, headers=HEADERS, timeout=6)
 
 
 
 
 
39
 
40
  if r.status_code != 200:
41
  return ""
@@ -45,31 +57,44 @@ def scrape_page(url: str) -> str:
45
  for tag in soup(["script", "style", "noscript"]):
46
  tag.decompose()
47
 
48
- return soup.get_text(" ")
 
 
49
 
50
  except Exception:
51
  return ""
52
 
53
 
54
- def scrape_social_knowledge(query: str, limit: int = 40) -> List[Dict]:
55
 
56
  knowledge = []
57
 
58
- for name, url in SOURCES.items():
59
 
60
  try:
61
 
62
- full_url = url.format(query=query.replace(" ", "+"))
 
 
63
 
64
- text = scrape_page(full_url)
65
 
66
- sentences = extract_sentences(text)
67
 
68
  for s in sentences[:limit]:
69
 
70
  knowledge.append({
71
- "source": name,
72
- "text": s
 
 
 
 
 
 
 
 
 
73
  })
74
 
75
  except Exception:
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
  import re
4
+ import time
5
  from typing import List, Dict
6
 
7
  HEADERS = {
8
+ "User-Agent": "Mozilla/5.0 (MVI-AI Knowledge Engine)"
9
  }
10
 
11
  SOURCES = {
 
15
  "medium": "https://medium.com/search?q={query}"
16
  }
17
 
18
+
19
def clean_text(text: str) -> str:
    """Normalize a scraped text fragment: drop URLs, collapse whitespace, trim.

    URLs are removed *before* whitespace runs are collapsed, so the gap left
    by a deleted URL folds into a single space. (The previous order collapsed
    whitespace first, leaving a double space wherever a URL was removed.)
    """
    # Strip bare URLs: "http"/"https" followed by any run of non-space chars.
    text = re.sub(r'http\S+', '', text)
    # Collapse every whitespace run (spaces, tabs, newlines) to one space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
24
+
25
 
26
def extract_sentences(text: str, max_len: int = 200) -> List[str]:
    """Split *text* on sentence-ending punctuation (. ! ?) and return the
    cleaned fragments whose length is strictly between 30 and *max_len*.

    Each raw fragment is passed through ``clean_text`` before the length
    filter is applied; fragments that are too short (noise) or too long
    are discarded.
    """
    cleaned_fragments = (clean_text(chunk) for chunk in re.split(r'[.!?]', text))
    return [frag for frag in cleaned_fragments if 30 < len(frag) < max_len]
 
42
  def scrape_page(url: str) -> str:
43
 
44
  try:
45
+
46
+ r = requests.get(
47
+ url,
48
+ headers=HEADERS,
49
+ timeout=6
50
+ )
51
 
52
  if r.status_code != 200:
53
  return ""
 
57
  for tag in soup(["script", "style", "noscript"]):
58
  tag.decompose()
59
 
60
+ text = soup.get_text(" ")
61
+
62
+ return text
63
 
64
  except Exception:
65
  return ""
66
 
67
 
68
+ def scrape_social_knowledge(query: str, limit: int = 30) -> List[Dict]:
69
 
70
  knowledge = []
71
 
72
+ for source_name, url in SOURCES.items():
73
 
74
  try:
75
 
76
+ full_url = url.format(
77
+ query=query.replace(" ", "+")
78
+ )
79
 
80
+ page_text = scrape_page(full_url)
81
 
82
+ sentences = extract_sentences(page_text)
83
 
84
  for s in sentences[:limit]:
85
 
86
  knowledge.append({
87
+
88
+ "query": query,
89
+
90
+ "source": source_name,
91
+
92
+ "url": full_url,
93
+
94
+ "text": s,
95
+
96
+ "timestamp": time.time()
97
+
98
  })
99
 
100
  except Exception: