Spaces:
Sleeping
Sleeping
Upload 20 files
Browse files- core/__init__.py +0 -0
- core/analytics.py +14 -0
- core/appeals.py +16 -0
- core/async_search.py +10 -0
- core/cache.py +53 -0
- core/citations.py +7 -0
- core/cluster.py +12 -0
- core/explain.py +12 -0
- core/faiss_vector.py +25 -0
- core/health.py +12 -0
- core/index.py +17 -0
- core/journalist.py +17 -0
- core/multi_program.py +16 -0
- core/preview.py +7 -0
- core/redaction.py +7 -0
- core/saved.py +12 -0
- core/search.py +39 -0
- core/semantic_refine.py +14 -0
- core/throttle.py +16 -0
- core/vector.py +11 -0
core/__init__.py
ADDED
|
File without changes
|
core/analytics.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""Lightweight in-process event analytics."""

import time
from collections import Counter

# Module-level tally: event name -> occurrence count.
_events = Counter()


def track(event: str):
    """Record one occurrence of *event*."""
    _events.update([event])


def snapshot():
    """Return a point-in-time view of every tracked event count."""
    now = int(time.time())
    return {"timestamp": now, "events": dict(_events)}
|
core/appeals.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def draft_appeal(document: str, agency: str, reason: str) -> str:
    """Draft a plain-text FOIA appeal letter for a publicly released document.

    Interpolates *agency*, *document* and the free-form *reason* into a fixed
    letter template and strips the surrounding blank lines.
    """
    return f"""
FOIA Appeal – Request for Reconsideration

Agency: {agency}
Document: {document}

Basis for Appeal:
{reason}

This appeal concerns a publicly released document and requests
review of redactions or withholdings under applicable FOIA exemptions.

Sincerely,
[Requestor]
""".strip()
|
core/async_search.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio


async def fanout_search(adapters, query):
    """Query every adapter concurrently and merge their document lists.

    Adapters that raise are silently skipped (only list results are kept),
    so one failing source does not sink the whole fan-out.
    """
    pending = [adapter.search(query) for adapter in adapters]
    outcomes = await asyncio.gather(*pending, return_exceptions=True)
    merged = []
    for outcome in outcomes:
        if isinstance(outcome, list):
            merged.extend(outcome)
    return merged
|
core/cache.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
from typing import Dict, Any, List
from core.faiss_vector import FaissIndex

_TTL = 300  # seconds a cache entry stays valid
_cache: Dict[str, Any] = {}
_faiss = None


def _now():
    """Current UNIX time, whole seconds."""
    return int(time.time())


def _get_index():
    """Lazily construct and memoize the shared FAISS index."""
    global _faiss
    if _faiss is None:
        _faiss = FaissIndex()
    return _faiss


def cache_get(key):
    """Return cached data for *key*, or None when absent or expired."""
    entry = _cache.get(key)
    if not entry:
        return None
    stamped_at, payload = entry
    if _now() - stamped_at > _TTL:
        # Expired: evict eagerly and report a miss.
        _cache.pop(key, None)
        return None
    return payload


def cache_set(key, data: List[dict]):
    """Store *data* under *key* and feed its snippets to the FAISS index."""
    _cache[key] = (_now(), data)
    # add snippets to FAISS for local semantic recall
    snippets = [item.get("snippet", "") for item in data if item.get("snippet")]
    if snippets:
        try:
            _get_index().add(snippets)
        except Exception:
            # Semantic recall is best-effort; caching must not fail with it.
            pass


def dedupe(results: List[dict]) -> List[dict]:
    """Drop results repeating the same (source, url, snippet) triple."""
    seen = set()
    unique = []
    for result in results:
        fingerprint = hash(
            (result.get("source"), result.get("url"), result.get("snippet"))
        )
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        unique.append(result)
    return unique


def source_counts(results: List[dict]) -> Dict[str, int]:
    """Tally how many results each source contributed."""
    counts: Dict[str, int] = {}
    for result in results:
        name = result.get("source", "Unknown")
        counts[name] = counts.get(name, 0) + 1
    return counts
|
core/citations.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def citation_block(result: dict) -> str:
    """Format a standard multi-line citation footer for one search result."""
    source = result.get('source')
    title = result.get('title', 'Unknown')
    url = result.get('url')
    retrieved = result.get('retrieved_at', 'N/A')
    lines = [
        "---",
        f"Source: {source}",
        f"Title: {title}",
        f"URL: {url}",
        f"Retrieved: {retrieved}",
        "---",
    ]
    return "\n".join(lines)
|
core/cluster.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict
from core.faiss_vector import FaissIndex


def cluster_results(results: List[dict], k: int = 5) -> Dict[str, List[dict]]:
    """Group search results by their source agency.

    FIX: the original built a FaissIndex over the snippets and then threw it
    away — pure dead work that also made this crash when FAISS was not
    installed, even though the grouping never used it. The index build is
    removed. `k` is unused by the grouping but kept for interface stability.
    """
    clusters: Dict[str, List[dict]] = {}
    for result in results:
        clusters.setdefault(result.get("source", "Unknown"), []).append(result)
    return clusters
|
core/explain.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def explain(results):
    """Summarize how a result set was produced (provenance/transparency).

    FIX: the original returned ``list(set(...))`` whose order is
    nondeterministic between runs; sources are now sorted (key=str so a
    missing/None source cannot break the comparison).
    """
    sources = sorted({r.get("source") for r in results}, key=str)
    return {
        "total_results": len(results),
        "sources": sources,
        "methods": [
            "Public FOIA reading room search",
            "Async fan-out querying",
            "Deduplication",
            "Semantic refinement (FAISS)"
        ],
        "no_restricted_access": True
    }
|
core/faiss_vector.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Optional dependencies: degrade gracefully when FAISS / sentence-transformers
# are missing; FaissIndex then fails loudly at construction time.
try:
    import faiss
    from sentence_transformers import SentenceTransformer
except ImportError:
    faiss = None
    SentenceTransformer = None  # FIX: was left unbound on ImportError


class FaissIndex:
    """Thin wrapper around a flat-L2 FAISS index over text snippets."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        if faiss is None:
            raise RuntimeError("FAISS not installed")
        self.model = SentenceTransformer(model_name)
        self.index = None  # created lazily on first add()
        self.docs = []     # stored texts, parallel to index rows

    def add(self, texts):
        """Embed *texts* and append them to the index (no-op for empty input)."""
        if not texts:
            return  # avoid encoding/adding a zero-row batch
        emb = self.model.encode(texts)
        if self.index is None:
            self.index = faiss.IndexFlatL2(emb.shape[1])
        self.index.add(emb)
        self.docs.extend(texts)

    def search(self, query, k=5):
        """Return up to *k* stored texts nearest to *query* (L2 distance)."""
        if self.index is None:
            return []  # FIX: original raised AttributeError on an empty index
        emb = self.model.encode([query])
        D, I = self.index.search(emb, k)
        # FIX: FAISS pads missing neighbours with -1; the original's
        # `i < len(self.docs)` test let -1 through and spuriously returned
        # docs[-1]. Require a non-negative, in-range row id.
        return [self.docs[i] for i in I[0] if 0 <= i < len(self.docs)]
|
core/health.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class HealthRegistry:
    """Tracks per-agency health flags; unknown agencies default to healthy."""

    def __init__(self):
        # agency name -> bool (True = healthy/enabled)
        self.state = {}

    def update(self, agency, healthy=True):
        """Set the health flag for *agency*."""
        self.state[agency] = healthy

    def is_enabled(self, agency):
        """Return the agency's flag; agencies never seen count as healthy."""
        return self.state.get(agency, True)

    def badge(self, agency):
        """Human-readable status badge for *agency*."""
        if self.is_enabled(agency):
            return "🟢 Healthy"
        return "🔴 Disabled"
|
core/index.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, json, numpy as np
from core.vector import embed


def load_docs(base="data/demo"):
    """Load the demo corpus under *base* and embed each document's text.

    Reads ``<base>/metadata.json`` (mapping filename -> metadata dict) and
    each file under ``<base>/documents/``; returns a list of dicts with
    id/text/vec/agency/year/program keys.

    FIX: the original leaked both file handles via ``json.load(open(...))``
    and ``open(...).read()`` and opened the metadata file without an explicit
    encoding; both opens now use context managers and UTF-8.
    """
    with open(os.path.join(base, "metadata.json"), encoding="utf-8") as meta_f:
        meta = json.load(meta_f)
    docs = []
    for fname, m in meta.items():
        with open(os.path.join(base, "documents", fname), encoding="utf-8") as doc_f:
            text = doc_f.read()
        docs.append({
            "id": fname,
            "text": text,
            "vec": embed(text),
            "agency": m["agency"],
            "year": m["year"],
            "program": m.get("program", "Unknown")
        })
    return docs
|
core/journalist.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import zipfile, json, os, time


def journalist_export(results, out_path):
    """Export *results* as a ZIP of per-document JSON files plus an INDEX.

    Each result becomes ``doc_<i>.json`` holding source/url/snippet plus an
    export timestamp; ``INDEX.json`` aggregates the same records. Returns
    *out_path* for convenience.
    """
    manifest = []
    with zipfile.ZipFile(out_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for position, result in enumerate(results):
            record = {
                "source": result.get("source"),
                "url": result.get("url"),
                "snippet": result.get("snippet"),
                "timestamp": time.time()
            }
            archive.writestr(f"doc_{position}.json", json.dumps(record, indent=2))
            manifest.append(record)
        archive.writestr("INDEX.json", json.dumps(manifest, indent=2))
    return out_path
|
core/multi_program.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List


def compare_programs(docs: List[dict]) -> Dict[str, Dict[str, int]]:
    """Count documents per agency per year.

    The year is the first four characters of each doc's "date" field;
    documents whose date does not start with four digits are skipped.
    """
    matrix: Dict[str, Dict[str, int]] = {}

    for doc in docs:
        year = doc.get("date", "")[:4]
        if not year.isdigit():
            continue
        per_year = matrix.setdefault(doc.get("agency", "Unknown"), {})
        per_year[year] = per_year.get(year, 0) + 1

    return matrix
|
core/preview.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import html


def safe_preview(url: str):
    """Return an HTML preview snippet for *url*, or a safe placeholder.

    Empty URLs and URLs that look like raw documents (pdf/download) get a
    placeholder instead of an iframe.

    FIX (security): the URL comes from external search results and was
    interpolated unescaped into the iframe's src attribute, allowing markup
    injection; it is now attribute-escaped with html.escape.
    """
    if not url:
        return "<i>No preview available</i>"
    lowered = url.lower()
    if any(marker in lowered for marker in ("pdf", "download")):
        return "<i>Preview disabled (redaction-protected document)</i>"
    safe_url = html.escape(url, quote=True)
    return f"<iframe src='{safe_url}' width='100%' height='400'></iframe>"
|
core/redaction.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def redaction_confidence(result: dict) -> float:
|
| 2 |
+
url = result.get("url","").lower()
|
| 3 |
+
score = 0.9
|
| 4 |
+
if "pdf" in url: score -= 0.3
|
| 5 |
+
if "redact" in url: score -= 0.4
|
| 6 |
+
if "download" in url: score -= 0.2
|
| 7 |
+
return max(0.0, score)
|
core/saved.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json, hashlib, time

# In-memory store of saved searches, keyed by a short content hash.
_STORE = {}


def save_search(query, sources):
    """Snapshot a query plus its source list; return a 12-char lookup key."""
    payload = {"q": query, "s": sources, "t": int(time.time())}
    digest = hashlib.sha256(json.dumps(payload).encode()).hexdigest()
    key = digest[:12]
    _STORE[key] = payload
    return key


def load_search(key):
    """Return the saved payload for *key*, or None if unknown."""
    return _STORE.get(key)
|
core/search.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, TypedDict
import json
import os

# Location of the pre-built demo index (a JSON list of document records).
DATA_PATH = "data/index/demo_index.json"


class SearchResult(TypedDict):
    document: str
    agency: str
    date: str
    excerpt: str
    citation: str
    score: float


def search_docs(query: str) -> List[SearchResult]:
    """Case-insensitive substring search over the demo index.

    Score is the query's occurrence count normalized by excerpt length;
    results come back best-first. A missing index file yields [].

    FIX: the index file is now opened with an explicit UTF-8 encoding
    instead of the platform default.
    """
    query_l = query.lower()
    results: List[SearchResult] = []

    if not os.path.exists(DATA_PATH):
        return results

    with open(DATA_PATH, "r", encoding="utf-8") as f:
        docs = json.load(f)

    for d in docs:
        text = d["excerpt"].lower()
        if query_l not in text:
            continue
        score = text.count(query_l) / max(len(text), 1)
        results.append({
            "document": d["document"],
            "agency": d["agency"],
            "date": d["date"],
            "excerpt": d["excerpt"],
            "citation": d["citation"],
            "score": round(score, 4)
        })

    return sorted(results, key=lambda x: x["score"], reverse=True)
|
core/semantic_refine.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
from core.faiss_vector import FaissIndex


class SemanticRefiner:
    """Semantic re-ranking over result snippets via a local FAISS index."""

    def __init__(self):
        self.index = FaissIndex()

    def build_from_results(self, results: List[dict]):
        """Index every non-empty snippet found in *results*."""
        snippets = [item.get("snippet", "") for item in results if item.get("snippet")]
        if snippets:
            self.index.add(snippets)

    def refine(self, query: str, k: int = 10):
        """Return up to *k* indexed snippets semantically closest to *query*."""
        return self.index.search(query, k)
|
core/throttle.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
import time


class AgencyThrottle:
    """Async rate limiter enforcing a minimum interval between calls."""

    def __init__(self, min_interval=1.0):
        self.min_interval = min_interval  # seconds required between calls
        self.last_call = 0                # UNIX time of the most recent call
        self.healthy = True               # flipped off by mark_unhealthy()

    async def wait(self):
        """Sleep just long enough to honor min_interval, then stamp the call.

        FIX: the original called asyncio.sleep without ever importing
        asyncio, raising NameError the first time throttling kicked in.
        """
        delta = time.time() - self.last_call
        if delta < self.min_interval:
            await asyncio.sleep(self.min_interval - delta)
        self.last_call = time.time()

    def mark_unhealthy(self):
        """Flag this agency as unhealthy; callers decide how to react."""
        self.healthy = False
|
core/vector.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
import re
import zlib

# Dimensionality of the hashing-trick embedding space.
DIM = 512


def embed(text: str) -> np.ndarray:
    """L2-normalized bag-of-words hashing embedding of *text*.

    Tokens are runs of two or more ASCII letters, lower-cased, and bucketed
    into a fixed DIM-wide vector; the all-zero vector is returned unchanged.

    FIX: the original bucketed with the builtin hash(), which is salted per
    process (PYTHONHASHSEED), so the same text embedded differently across
    runs — breaking any persisted vectors. zlib.crc32 is stable everywhere.
    """
    v = np.zeros(DIM, dtype=np.float32)
    for w in re.findall(r"[a-zA-Z]{2,}", text.lower()):
        v[zlib.crc32(w.encode("utf-8")) % DIM] += 1.0
    n = np.linalg.norm(v)
    return v / n if n > 0 else v