GodsDevProject committed on
Commit
893401e
·
verified ·
1 Parent(s): 0155c1c

Create ingest/loader.py

Browse files
Files changed (1) hide show
  1. ingest/loader.py +24 -0
ingest/loader.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from typing import List, Dict
4
+
5
def ingest_documents(enable_scraping: bool = False) -> List[Dict]:
    """Fetch a small, capped sample of link metadata from the FBI Vault home page.

    The function is best-effort and never raises: any failure while fetching
    or parsing is logged and whatever was collected so far is returned.

    Args:
        enable_scraping: When False (the default) no network request is made
            and an empty list is returned immediately.

    Returns:
        A list of at most 10 dicts, one per ``<a>`` element found on the page,
        each with the keys ``"title"`` (stripped link text), ``"agency"``
        (always ``"FBI"``), ``"date"`` (always an empty string) and
        ``"content"`` (the link's ``href`` attribute, or ``""`` if absent).
    """
    if not enable_scraping:
        return []

    # Function-scope import so the block stays self-contained.
    import logging

    log = logging.getLogger(__name__)

    # HF-safe: capped, read-only metadata fetch
    docs: List[Dict] = []
    try:
        response = requests.get("https://vault.fbi.gov", timeout=10)
        # Without this, the links of a 4xx/5xx error page would be
        # reported as if they were FBI documents.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        for link in soup.select("a")[:10]:
            docs.append({
                "title": link.text.strip(),
                "agency": "FBI",
                "date": "",
                "content": link.get("href", ""),
            })
    except Exception:
        # Deliberately broad: callers rely on this function never raising.
        # Log instead of silently swallowing so failures are diagnosable.
        log.warning(
            "FBI Vault ingest failed; returning %d document(s)",
            len(docs),
            exc_info=True,
        )

    return docs