Spaces:
Running
Running
import re
import time
import urllib.parse
from typing import Dict, List

import requests
from bs4 import BeautifulSoup
| HEADERS = { | |
| "User-Agent": "Mozilla/5.0 (MVI-AI Knowledge Engine)" | |
| } | |
| SOURCES = { | |
| "reddit": "https://www.reddit.com/search/?q={query}", | |
| "wikipedia": "https://en.wikipedia.org/wiki/{query}", | |
| "stack_overflow": "https://stackoverflow.com/search?q={query}", | |
| "medium": "https://medium.com/search?q={query}" | |
| } | |
| def clean_text(text: str) -> str: | |
| text = re.sub(r'\s+', ' ', text) | |
| text = re.sub(r'http\S+', '', text) | |
| text = text.strip() | |
| return text | |
| def extract_sentences(text: str, max_len: int = 200) -> List[str]: | |
| sentences = re.split(r'[.!?]', text) | |
| cleaned = [] | |
| for s in sentences: | |
| s = clean_text(s) | |
| if len(s) > 30 and len(s) < max_len: | |
| cleaned.append(s) | |
| return cleaned | |
| def scrape_page(url: str) -> str: | |
| try: | |
| r = requests.get( | |
| url, | |
| headers=HEADERS, | |
| timeout=6 | |
| ) | |
| if r.status_code != 200: | |
| return "" | |
| soup = BeautifulSoup(r.text, "html.parser") | |
| for tag in soup(["script", "style", "noscript"]): | |
| tag.decompose() | |
| text = soup.get_text(" ") | |
| return text | |
| except Exception: | |
| return "" | |
| def scrape_social_knowledge(query: str, limit: int = 30) -> List[Dict]: | |
| knowledge = [] | |
| for source_name, url in SOURCES.items(): | |
| try: | |
| full_url = url.format( | |
| query=query.replace(" ", "+") | |
| ) | |
| page_text = scrape_page(full_url) | |
| sentences = extract_sentences(page_text) | |
| for s in sentences[:limit]: | |
| knowledge.append({ | |
| "query": query, | |
| "source": source_name, | |
| "url": full_url, | |
| "text": s, | |
| "timestamp": time.time() | |
| }) | |
| except Exception: | |
| continue | |
| return knowledge |