Update reasoning/scraper.py
reasoning/scraper.py  CHANGED  (+38 -13)
@@ -1,10 +1,11 @@
 import requests
 from bs4 import BeautifulSoup
 import re
+import time
 from typing import List, Dict
 
 HEADERS = {
-    "User-Agent": "Mozilla/5.0 (AI Knowledge
+    "User-Agent": "Mozilla/5.0 (MVI-AI Knowledge Engine)"
 }
 
 SOURCES = {
@@ -14,19 +15,25 @@ SOURCES = {
     "medium": "https://medium.com/search?q={query}"
 }
 
+
 def clean_text(text: str) -> str:
     text = re.sub(r'\s+', ' ', text)
     text = re.sub(r'http\S+', '', text)
-
+    text = text.strip()
+    return text
+
 
-def extract_sentences(text: str, max_len=200) -> List[str]:
+def extract_sentences(text: str, max_len: int = 200) -> List[str]:
 
     sentences = re.split(r'[.!?]', text)
+
     cleaned = []
 
     for s in sentences:
+
         s = clean_text(s)
+
-
+        if len(s) > 30 and len(s) < max_len:
             cleaned.append(s)
 
     return cleaned
@@ -35,7 +42,12 @@ def extract_sentences(text: str, max_len=200) -> List[str]:
 def scrape_page(url: str) -> str:
 
     try:
-
+
+        r = requests.get(
+            url,
+            headers=HEADERS,
+            timeout=6
+        )
 
         if r.status_code != 200:
             return ""
@@ -45,31 +57,44 @@ def scrape_page(url: str) -> str:
         for tag in soup(["script", "style", "noscript"]):
             tag.decompose()
 
-
+        text = soup.get_text(" ")
+
+        return text
 
     except Exception:
         return ""
 
 
-def scrape_social_knowledge(query: str, limit: int =
+def scrape_social_knowledge(query: str, limit: int = 30) -> List[Dict]:
 
     knowledge = []
 
-    for
+    for source_name, url in SOURCES.items():
 
         try:
 
-            full_url = url.format(
+            full_url = url.format(
+                query=query.replace(" ", "+")
+            )
 
-
+            page_text = scrape_page(full_url)
 
-            sentences = extract_sentences(
+            sentences = extract_sentences(page_text)
 
             for s in sentences[:limit]:
 
                 knowledge.append({
-
-                    "
+
+                    "query": query,
+
+                    "source": source_name,
+
+                    "url": full_url,
+
+                    "text": s,
+
+                    "timestamp": time.time()
+
                 })
 
         except Exception:
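For readers who want to exercise the change, here is a minimal driver sketch. It assumes the script runs from the repo root so reasoning.scraper is importable as a package module; the query string, the dedup step, and the knowledge.jsonl output path are illustrative, not part of this commit.

# Hypothetical driver for the updated scraper (not part of this commit).
# Assumes it is run from the repo root so `reasoning.scraper` is importable.
import json

from reasoning.scraper import scrape_social_knowledge

if __name__ == "__main__":
    records = scrape_social_knowledge("graph neural networks", limit=10)

    # Drop duplicate sentences scraped from different sources.
    seen = set()
    unique = []
    for rec in records:
        if rec["text"] not in seen:
            seen.add(rec["text"])
            unique.append(rec)

    # One JSON object per line, ready for downstream indexing.
    with open("knowledge.jsonl", "w", encoding="utf-8") as f:
        for rec in unique:
            f.write(json.dumps(rec) + "\n")

    print(f"kept {len(unique)} of {len(records)} scraped sentences")

Because scrape_page returns "" on any error and scrape_social_knowledge swallows per-source failures, an empty list is the only failure mode a driver like this has to handle.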