# Musombi's picture
# Update reasoning/scraper.py
# e6abddd verified
import re
import time
from typing import List, Dict
from urllib.parse import quote, quote_plus

import requests
from bs4 import BeautifulSoup
# HTTP headers passed to requests.get() by scrape_page; only a User-Agent
# string is set.
HEADERS: Dict[str, str] = {
    "User-Agent": "Mozilla/5.0 (MVI-AI Knowledge Engine)"
}

# URL templates keyed by source name. The "{query}" placeholder is filled in
# with str.format() by scrape_social_knowledge. Three templates place the
# query in the URL's query string; the "wikipedia" one places it in the path.
SOURCES: Dict[str, str] = {
    "reddit": "https://www.reddit.com/search/?q={query}",
    "wikipedia": "https://en.wikipedia.org/wiki/{query}",
    "stack_overflow": "https://stackoverflow.com/search?q={query}",
    "medium": "https://medium.com/search?q={query}"
}
def clean_text(text: str) -> str:
    """Return *text* with URLs removed and whitespace normalised.

    Args:
        text: Raw text, typically extracted from an HTML page.

    Returns:
        The text with every ``http://`` / ``https://`` URL deleted, each run
        of whitespace collapsed to a single space, and no leading or trailing
        whitespace.
    """
    # Remove URLs *before* collapsing whitespace: deleting a URL that sits
    # between two spaces would otherwise leave a double space behind.
    # Requiring "://" keeps ordinary words such as "httpd" intact.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def extract_sentences(text: str, max_len: int = 200) -> List[str]:
    """Split *text* into cleaned sentences of a useful length.

    Args:
        text: Raw text to split.
        max_len: Exclusive upper bound on the length of a kept sentence.

    Returns:
        Sentences (without their terminating punctuation) whose cleaned
        length is greater than 30 and less than ``max_len`` characters, in
        their original order.
    """
    if not text:
        return []

    # Strip URLs and normalise whitespace before splitting; otherwise the
    # dots inside a URL act as sentence breaks and URL fragments such as
    # "com/page" survive as text.
    text = clean_text(text)

    # Split only on terminators followed by whitespace or end of text, so
    # decimals ("3.14") and dotted names ("file.txt") stay in one piece.
    fragments = re.split(r'[.!?]+(?=\s|$)', text)

    cleaned = []
    for fragment in fragments:
        sentence = clean_text(fragment)
        if 30 < len(sentence) < max_len:
            cleaned.append(sentence)
    return cleaned
def scrape_page(url: str) -> str:
    """Fetch *url* and return the visible text of the page.

    The request is sent with the module-level ``HEADERS`` and a 6 second
    timeout. ``<script>``, ``<style>`` and ``<noscript>`` elements are
    removed before the text is extracted.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The page text with elements separated by single spaces, or an empty
        string if the request fails, the status code is not 200, or the
        page cannot be parsed.
    """
    try:
        r = requests.get(
            url,
            headers=HEADERS,
            timeout=6
        )
        if r.status_code != 200:
            return ""

        # When the server sends "text/html" without a charset, requests
        # falls back to ISO-8859-1 and r.text garbles UTF-8 pages. Hand the
        # raw bytes to BeautifulSoup so it can honour <meta charset>, and
        # only force the header encoding when one was actually declared.
        content_type = r.headers.get("Content-Type", "")
        declared_encoding = (
            r.encoding if "charset" in content_type.lower() else None
        )
        soup = BeautifulSoup(
            r.content,
            "html.parser",
            from_encoding=declared_encoding
        )

        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        return soup.get_text(" ")
    except Exception:
        # Deliberate best-effort boundary: callers rely on this function
        # never raising and treat an empty string as "nothing scraped".
        return ""
def scrape_social_knowledge(query: str, limit: int = 30) -> List[Dict]:
    """Scrape every configured source for sentences related to *query*.

    Each URL template in ``SOURCES`` is filled in with the URL-encoded
    query, downloaded with ``scrape_page`` and split with
    ``extract_sentences``.

    Args:
        query: Free-text search phrase. A blank query yields no results.
        limit: Maximum number of sentences kept per source (not in total).
            Negative values are treated as 0.

    Returns:
        A list of dicts with the keys ``query``, ``source``, ``url``,
        ``text`` and ``timestamp`` (seconds since the epoch), one per kept
        sentence. Sources that fail contribute nothing.
    """
    if not query or not query.strip():
        return []

    search_term = query.strip()
    # A negative slice bound would silently drop trailing sentences.
    per_source_cap = max(limit, 0)

    knowledge = []
    for source_name, url in SOURCES.items():
        try:
            # Encode according to where the placeholder sits: "+" means a
            # space only inside a query string; in a path segment (the
            # wikipedia template) it is a literal plus sign.
            template_prefix = url.partition("{query}")[0]
            if "?" in template_prefix:
                encoded_query = quote_plus(search_term)
            else:
                encoded_query = quote(search_term, safe="")

            full_url = url.format(query=encoded_query)
            page_text = scrape_page(full_url)
            sentences = extract_sentences(page_text)
            for s in sentences[:per_source_cap]:
                knowledge.append({
                    "query": query,
                    "source": source_name,
                    "url": full_url,
                    "text": s,
                    "timestamp": time.time()
                })
        except Exception:
            # Best-effort aggregation: one failing source must not prevent
            # the remaining sources from being scraped.
            continue
    return knowledge