"""Best-effort scraper that collects short knowledge sentences for a query
from a few public web sources (Reddit, Wikipedia, Stack Overflow, Medium)."""

import re
import time
import urllib.parse
from typing import Dict, List

import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (MVI-AI Knowledge Engine)",
}

# URL templates; {query} is filled with a percent-encoded search term
# (or an underscore-joined article title for Wikipedia).
SOURCES = {
    "reddit": "https://www.reddit.com/search/?q={query}",
    "wikipedia": "https://en.wikipedia.org/wiki/{query}",
    "stack_overflow": "https://stackoverflow.com/search?q={query}",
    "medium": "https://medium.com/search?q={query}",
}


def clean_text(text: str) -> str:
    """Collapse runs of whitespace, strip URLs, and trim the result."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'http\S+', '', text)
    return text.strip()


def extract_sentences(text: str, max_len: int = 200) -> List[str]:
    """Split *text* on sentence-ending punctuation and return cleaned
    fragments whose length is strictly between 30 and *max_len* characters.

    The length bounds filter out navigation crumbs (too short) and
    run-on page chrome (too long).
    """
    cleaned = []
    for fragment in re.split(r'[.!?]', text):
        fragment = clean_text(fragment)
        if 30 < len(fragment) < max_len:
            cleaned.append(fragment)
    return cleaned


def scrape_page(url: str) -> str:
    """Fetch *url* and return its visible text, or "" on any failure.

    <script>, <style> and <noscript> subtrees are removed before text
    extraction. Network/HTTP errors and non-200 responses are swallowed
    deliberately — this is a best-effort scraper.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=6)
        if r.status_code != 200:
            return ""
        soup = BeautifulSoup(r.text, "html.parser")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        return soup.get_text(" ")
    except requests.RequestException:
        # Narrowed from bare Exception: network/timeout/HTTP errors are the
        # expected failure mode here; anything else should surface.
        return ""


def scrape_social_knowledge(query: str, limit: int = 30) -> List[Dict]:
    """Scrape up to *limit* sentences per source for *query*.

    Returns a list of dicts with keys: query, source, url, text, timestamp.
    A failure in one source is skipped so the others still run.
    """
    knowledge: List[Dict] = []
    for source_name, url_template in SOURCES.items():
        try:
            # Wikipedia article URLs use underscores in the title path;
            # the search endpoints take a percent-encoded query string.
            # (The original `query.replace(" ", "+")` 404'd on Wikipedia
            # and never escaped characters like '&' or '#'.)
            if source_name == "wikipedia":
                encoded = urllib.parse.quote(query.replace(" ", "_"))
            else:
                encoded = urllib.parse.quote_plus(query)
            full_url = url_template.format(query=encoded)
            page_text = scrape_page(full_url)
            for sentence in extract_sentences(page_text)[:limit]:
                knowledge.append({
                    "query": query,
                    "source": source_name,
                    "url": full_url,
                    "text": sentence,
                    "timestamp": time.time(),
                })
        except Exception:
            # Best effort: one broken source must not abort the rest.
            continue
    return knowledge