Spaces:
Running
Running
import re
import time
import urllib.parse
from typing import Dict, List

import requests
from bs4 import BeautifulSoup
| HEADERS = { | |
| "User-Agent": "Mozilla/5.0 (MVI-AI Knowledge Engine)" | |
| } | |
| SOURCES = { | |
| "reddit": "https://www.reddit.com/search/?q={query}", | |
| "wikipedia": "https://en.wikipedia.org/wiki/{query}", | |
| "stack_overflow": "https://stackoverflow.com/search?q={query}", | |
| "medium": "https://medium.com/search?q={query}" | |
| } | |
| def clean_text(text: str) -> str: | |
| text = re.sub(r'\s+', ' ', text) | |
| text = re.sub(r'http\S+', '', text) | |
| text = text.strip() | |
| return text | |
| def extract_sentences(text: str, max_len: int = 200) -> List[str]: | |
| sentences = re.split(r'[.!?]', text) | |
| cleaned = [] | |
| for s in sentences: | |
| s = clean_text(s) | |
| if len(s) > 30 and len(s) < max_len: | |
| cleaned.append(s) | |
| return cleaned | |
| def scrape_page(url: str) -> str: | |
| try: | |
| r = requests.get( | |
| url, | |
| headers=HEADERS, | |
| timeout=6 | |
| ) | |
| if r.status_code != 200: | |
| return "" | |
| soup = BeautifulSoup(r.text, "html.parser") | |
| for tag in soup(["script", "style", "noscript"]): | |
| tag.decompose() | |
| text = soup.get_text(" ") | |
| return text | |
| except Exception: | |
| return "" | |
| def scrape_social_knowledge(query: str, limit: int = 30) -> List[Dict]: | |
| knowledge = [] | |
| for source_name, url in SOURCES.items(): | |
| try: | |
| full_url = url.format( | |
| query=query.replace(" ", "+") | |
| ) | |
| page_text = scrape_page(full_url) | |
| sentences = extract_sentences(page_text) | |
| for s in sentences[:limit]: | |
| knowledge.append({ | |
| "query": query, | |
| "source": source_name, | |
| "url": full_url, | |
| "text": s, | |
| "timestamp": time.time() | |
| }) | |
| except Exception: | |
| continue | |
| return knowledge |