Spaces:

GodsDevProject
/

FOIA_Declassified_Document_Search

Sleeping

GodsDevProject committed on Jan 9

Commit

893401e

verified ·

1 Parent(s): 0155c1c

Create ingest/loader.py

Files changed (1) hide show

ingest/loader.py ADDED Viewed

+import requests
+from bs4 import BeautifulSoup
+from typing import List, Dict
+def ingest_documents(enable_scraping: bool = False) -> List[Dict]:
+    """Return link records scraped from the FBI Vault landing page.
+
+    Scraping is opt-in. When ``enable_scraping`` is false no network
+    request is made and an empty list is returned.
+
+    Args:
+        enable_scraping: When true, fetch ``https://vault.fbi.gov`` and
+            build one record per anchor, for at most the first 10 anchors
+            on the page.
+
+    Returns:
+        A list of dicts with the keys ``title`` (stripped anchor text),
+        ``agency`` (always ``"FBI"``), ``date`` (always ``""``) and
+        ``content`` (the anchor's ``href``, or ``""`` when it has none).
+        The list is empty when scraping is disabled or the fetch fails.
+
+    This function is best-effort and does not raise: request failures and
+    HTTP error statuses are logged as warnings, any other error is logged
+    with its traceback, and whatever was collected so far is returned.
+    """
+    if not enable_scraping:
+        return []
+
+    # Imported here so the block does not rely on a module-level import.
+    import logging
+
+    logger = logging.getLogger(__name__)
+    source_url = "https://vault.fbi.gov"
+    max_links = 10
+
+    # HF-safe: capped, read-only metadata fetch
+    docs: List[Dict] = []
+    try:
+        response = requests.get(source_url, timeout=10)
+        # Without this check the links of a 4xx/5xx error page would be
+        # ingested as if they were documents.
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "html.parser")
+        for link in soup.select("a")[:max_links]:
+            docs.append({
+                "title": link.text.strip(),
+                "agency": "FBI",
+                "date": "",
+                "content": link.get("href", ""),
+            })
+    except requests.RequestException as exc:
+        # Expected failure mode (timeout, DNS, HTTP error): keep going.
+        logger.warning("Could not fetch %s: %s", source_url, exc)
+    except Exception:
+        # Still best-effort, but leave a traceback instead of hiding the bug.
+        logger.exception("Unexpected error while ingesting %s", source_url)
+    return docs