"""Best-effort scraper that collects short knowledge sentences for a query
from a few public web sources (Reddit, Wikipedia, Stack Overflow, Medium)."""

import re
import time
import urllib.parse
from typing import Dict, List

import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (MVI-AI Knowledge Engine)",
}

# URL templates; {query} is filled with a percent-encoded search term
# (or an underscore-joined article title for Wikipedia).
SOURCES = {
    "reddit": "https://www.reddit.com/search/?q={query}",
    "wikipedia": "https://en.wikipedia.org/wiki/{query}",
    "stack_overflow": "https://stackoverflow.com/search?q={query}",
    "medium": "https://medium.com/search?q={query}",
}


def clean_text(text: str) -> str:
    """Collapse runs of whitespace, strip URLs, and trim the result."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'http\S+', '', text)
    return text.strip()


def extract_sentences(text: str, max_len: int = 200) -> List[str]:
    """Split *text* on sentence-ending punctuation and return cleaned
    fragments whose length is strictly between 30 and *max_len* characters.

    The length bounds filter out navigation crumbs (too short) and
    run-on page chrome (too long).
    """
    cleaned = []
    for fragment in re.split(r'[.!?]', text):
        fragment = clean_text(fragment)
        if 30 < len(fragment) < max_len:
            cleaned.append(fragment)
    return cleaned


def scrape_page(url: str) -> str:
    """Fetch *url* and return its visible text, or "" on any failure.

    <script>, <style> and <noscript> subtrees are removed before text
    extraction. Network/HTTP errors and non-200 responses are swallowed
    deliberately — this is a best-effort scraper.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=6)
        if r.status_code != 200:
            return ""
        soup = BeautifulSoup(r.text, "html.parser")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        return soup.get_text(" ")
    except requests.RequestException:
        # Narrowed from bare Exception: network/timeout/HTTP errors are the
        # expected failure mode here; anything else should surface.
        return ""


def scrape_social_knowledge(query: str, limit: int = 30) -> List[Dict]:
    """Scrape up to *limit* sentences per source for *query*.

    Returns a list of dicts with keys: query, source, url, text, timestamp.
    A failure in one source is skipped so the others still run.
    """
    knowledge: List[Dict] = []
    for source_name, url_template in SOURCES.items():
        try:
            # Wikipedia article URLs use underscores in the title path;
            # the search endpoints take a percent-encoded query string.
            # (The original `query.replace(" ", "+")` 404'd on Wikipedia
            # and never escaped characters like '&' or '#'.)
            if source_name == "wikipedia":
                encoded = urllib.parse.quote(query.replace(" ", "_"))
            else:
                encoded = urllib.parse.quote_plus(query)
            full_url = url_template.format(query=encoded)
            page_text = scrape_page(full_url)
            for sentence in extract_sentences(page_text)[:limit]:
                knowledge.append({
                    "query": query,
                    "source": source_name,
                    "url": full_url,
                    "text": sentence,
                    "timestamp": time.time(),
                })
        except Exception:
            # Best effort: one broken source must not abort the rest.
            continue
    return knowledge