Update reasoning/scraper.py

reasoning/scraper.py  (+34 -85)  CHANGED
@@ -1,10 +1,9 @@
-import aiohttp
-import asyncio
+import requests
+from bs4 import BeautifulSoup
 import re
 import time
 import json
 import os
-from bs4 import BeautifulSoup
 from typing import List, Dict
 
 HEADERS = {
@@ -19,7 +18,7 @@ DDG_SEARCH = "https://duckduckgo.com/html/?q={query}"
 
 
 # -------------------------
-# CACHE
+# CACHE
 # -------------------------
 
 def load_cache():
@@ -36,38 +35,19 @@ CACHE = load_cache()
 
 
 # -------------------------
-#
+# UTIL
 # -------------------------
 
 def normalize_query(query: str) -> str:
     query = query.lower()
 
-    stop_phrases = [
-        "what is", "who is", "define",
-        "explain", "tell me about",
-        "what are", "how does"
-    ]
-
-    for phrase in stop_phrases:
+    for phrase in ["what is", "who is", "define", "explain"]:
         query = query.replace(phrase, "")
 
     query = re.sub(r"[^\w\s]", "", query)
     return query.strip()
 
 
-def expand_query(query: str) -> List[str]:
-    return [
-        query,
-        f"{query} definition",
-        f"{query} meaning",
-        f"{query} explanation"
-    ]
-
-
-# -------------------------
-# CLEANING
-# -------------------------
-
 def clean_text(text: str) -> str:
     text = re.sub(r"\[\d+\]", "", text)
     text = re.sub(r"\s+", " ", text)
@@ -78,23 +58,22 @@ def clean_text(text: str) -> str:
 # FETCH
 # -------------------------
 
-async def fetch(session, url):
+def fetch(url: str) -> str:
     try:
-        async with session.get(url, headers=HEADERS, timeout=8) as resp:
-            if resp.status != 200:
-                return ""
-            return await resp.text()
+        r = requests.get(url, headers=HEADERS, timeout=8)
+        if r.status_code != 200:
+            return ""
+        return r.text
     except:
         return ""
 
 
 # -------------------------
-#
+# PARSE
 # -------------------------
 
 def extract_paragraphs(html: str) -> List[str]:
     soup = BeautifulSoup(html, "html.parser")
-
     paragraphs = soup.find_all("p")
 
     results = []
@@ -115,29 +94,25 @@ def extract_wiki(html: str) -> List[str]:
 
 
 # -------------------------
-# SEARCH
+# SEARCH FALLBACK
 # -------------------------
 
-async def wikipedia_search(session, query):
-    url = WIKI_SEARCH.format(query=query.replace(" ", "+"))
-    html = await fetch(session, url)
-
+def wikipedia_search(query: str) -> str:
+    html = fetch(WIKI_SEARCH.format(query=query.replace(" ", "+")))
     soup = BeautifulSoup(html, "html.parser")
-    result = soup.select_one(".mw-search-result-heading a")
 
+    result = soup.select_one(".mw-search-result-heading a")
     if result:
         return "https://en.wikipedia.org" + result.get("href")
 
     return ""
 
 
-async def duckduckgo_search(session, query):
-    url = DDG_SEARCH.format(query=query.replace(" ", "+"))
-    html = await fetch(session, url)
-
+def duckduckgo_search(query: str) -> List[str]:
+    html = fetch(DDG_SEARCH.format(query=query.replace(" ", "+")))
     soup = BeautifulSoup(html, "html.parser")
-    links = []
 
+    links = []
     for a in soup.select(".result__a"):
         href = a.get("href")
         if href and href.startswith("http"):
@@ -150,73 +125,41 @@ async def duckduckgo_search(session, query):
 # SCRAPERS
 # -------------------------
 
-async def scrape_wikipedia(session, query):
+def scrape_wikipedia(query: str) -> List[str]:
     url = WIKI_PAGE.format(query=query.replace(" ", "_"))
-    html = await fetch(session, url)
+    html = fetch(url)
 
     if "Wikipedia does not have an article" in html:
-        url = await wikipedia_search(session, query)
+        url = wikipedia_search(query)
         if not url:
             return []
-
-        html = await fetch(session, url)
+        html = fetch(url)
 
     return extract_wiki(html)
 
 
-async def scrape_generic(session, url):
-    html = await fetch(session, url)
+def scrape_generic(url: str) -> List[str]:
+    html = fetch(url)
     return extract_paragraphs(html)
 
 
 # -------------------------
-# RANKING
+# RANKING
 # -------------------------
 
 def rank_results(paragraphs: List[str], query: str) -> List[str]:
     q_words = set(query.lower().split())
 
     def score(p):
-        words = set(p.lower().split())
-        return len(q_words & words)
+        return sum(word in p.lower() for word in q_words)
 
     return sorted(paragraphs, key=score, reverse=True)
 
 
 # -------------------------
-# MAIN
+# MAIN
 # -------------------------
 
-async def async_scrape(query: str, limit: int):
-    async with aiohttp.ClientSession() as session:
-
-        queries = expand_query(query)
-
-        tasks = []
-
-        # Wikipedia tasks
-        for q in queries:
-            tasks.append(scrape_wikipedia(session, q))
-
-        results = await asyncio.gather(*tasks)
-
-        paragraphs = []
-        for r in results:
-            paragraphs.extend(r)
-
-        # Fallback to DuckDuckGo if empty
-        if not paragraphs:
-            links = await duckduckgo_search(session, query)
-
-            tasks = [scrape_generic(session, link) for link in links]
-            results = await asyncio.gather(*tasks)
-
-            for r in results:
-                paragraphs.extend(r)
-
-        return paragraphs
-
-
 def scrape_knowledge(query: str, limit: int = 5) -> List[Dict]:
     if query in CACHE:
         return CACHE[query]
@@ -225,7 +168,13 @@ def scrape_knowledge(query: str, limit: int = 5) -> List[Dict]:
     if not clean_query:
         return []
 
-    paragraphs = asyncio.run(async_scrape(clean_query, limit))
+    paragraphs = scrape_wikipedia(clean_query)
+
+    if not paragraphs:
+        links = duckduckgo_search(clean_query)
+
+        for link in links:
+            paragraphs.extend(scrape_generic(link))
 
     if not paragraphs:
         return []
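
A note on the ranking change: the old score() measured whole-word overlap (len(q_words & set(p.lower().split()))), while the new one counts how many query words occur anywhere in the paragraph, so substring hits now count and a query word like "art" also matches "particle". If that proves too loose, a whole-word variant is a small change; the sketch below is an illustration, not part of the commit:

    import re

    def score_whole_words(p: str, q_words: set) -> int:
        # Tokenize on word boundaries so "art" does not match "particle".
        words = set(re.findall(r"\w+", p.lower()))
        return len(q_words & words)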
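
One habit the rewrite keeps is the bare except: in fetch(), which swallows everything, including KeyboardInterrupt. A narrower variant would catch only requests errors; this is a suggestion rather than what the commit does, and the HEADERS dict here is a placeholder for the real one defined above the diff:

    import requests

    HEADERS = {"User-Agent": "scraper/1.0"}  # placeholder for the module's HEADERS

    def fetch(url: str) -> str:
        try:
            r = requests.get(url, headers=HEADERS, timeout=8)
            return r.text if r.status_code == 200 else ""
        except requests.RequestException:
            # Network and HTTP-level failures only; control-flow exceptions propagate.
            return ""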
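
With aiohttp and asyncio gone, the whole pipeline is blocking: scrape_knowledge() tries Wikipedia first, falls back to DuckDuckGo result links, and presumably ranks via rank_results, trims to limit, and stores in CACHE — that tail sits outside the visible hunks. A minimal usage sketch, assuming the file is importable as reasoning.scraper and using a made-up query:

    from reasoning.scraper import scrape_knowledge

    # Synchronous call; the shape of the returned dicts comes from code outside this diff.
    results = scrape_knowledge("what is entropy", limit=3)
    for r in results:
        print(r)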