Spaces:
Sleeping
Sleeping
Upload 20 files
Browse files- core/__init__.py +0 -0
- core/analytics.py +14 -0
- core/appeals.py +16 -0
- core/async_search.py +10 -0
- core/cache.py +53 -0
- core/citations.py +7 -0
- core/cluster.py +12 -0
- core/explain.py +12 -0
- core/faiss_vector.py +25 -0
- core/health.py +12 -0
- core/index.py +17 -0
- core/journalist.py +17 -0
- core/multi_program.py +16 -0
- core/preview.py +7 -0
- core/redaction.py +7 -0
- core/saved.py +12 -0
- core/search.py +39 -0
- core/semantic_refine.py +14 -0
- core/throttle.py +16 -0
- core/vector.py +11 -0
core/__init__.py
ADDED
|
File without changes
|
core/analytics.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""Lightweight in-process event analytics."""

import time
from collections import Counter

# Module-level tally: event name -> occurrence count.
_events = Counter()


def track(event: str):
    """Record one occurrence of *event*."""
    _events.update([event])


def snapshot():
    """Return a point-in-time view of every tracked event count."""
    now = int(time.time())
    return {"timestamp": now, "events": dict(_events)}
|
core/appeals.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def draft_appeal(document: str, agency: str, reason: str) -> str:
    """Draft a plain-text FOIA appeal letter for a publicly released document.

    Interpolates *agency*, *document* and the free-form *reason* into a fixed
    letter template and strips the surrounding blank lines.
    """
    return f"""
FOIA Appeal – Request for Reconsideration

Agency: {agency}
Document: {document}

Basis for Appeal:
{reason}

This appeal concerns a publicly released document and requests
review of redactions or withholdings under applicable FOIA exemptions.

Sincerely,
[Requestor]
""".strip()
|
core/async_search.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio


async def fanout_search(adapters, query):
    """Query every adapter concurrently and merge their document lists.

    Adapters that raise are silently skipped (only list results are kept),
    so one failing source does not sink the whole fan-out.
    """
    pending = [adapter.search(query) for adapter in adapters]
    outcomes = await asyncio.gather(*pending, return_exceptions=True)
    merged = []
    for outcome in outcomes:
        if isinstance(outcome, list):
            merged.extend(outcome)
    return merged
|
core/cache.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
from typing import Dict, Any, List
from core.faiss_vector import FaissIndex

_TTL = 300  # seconds a cache entry stays valid
_cache: Dict[str, Any] = {}
_faiss = None


def _now():
    """Current UNIX time, whole seconds."""
    return int(time.time())


def _get_index():
    """Lazily construct and memoize the shared FAISS index."""
    global _faiss
    if _faiss is None:
        _faiss = FaissIndex()
    return _faiss


def cache_get(key):
    """Return cached data for *key*, or None when absent or expired."""
    entry = _cache.get(key)
    if not entry:
        return None
    stamped_at, payload = entry
    if _now() - stamped_at > _TTL:
        # Expired: evict eagerly and report a miss.
        _cache.pop(key, None)
        return None
    return payload


def cache_set(key, data: List[dict]):
    """Store *data* under *key* and feed its snippets to the FAISS index."""
    _cache[key] = (_now(), data)
    # add snippets to FAISS for local semantic recall
    snippets = [item.get("snippet", "") for item in data if item.get("snippet")]
    if snippets:
        try:
            _get_index().add(snippets)
        except Exception:
            # Semantic recall is best-effort; caching must not fail with it.
            pass


def dedupe(results: List[dict]) -> List[dict]:
    """Drop results repeating the same (source, url, snippet) triple."""
    seen = set()
    unique = []
    for result in results:
        fingerprint = hash(
            (result.get("source"), result.get("url"), result.get("snippet"))
        )
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        unique.append(result)
    return unique


def source_counts(results: List[dict]) -> Dict[str, int]:
    """Tally how many results each source contributed."""
    counts: Dict[str, int] = {}
    for result in results:
        name = result.get("source", "Unknown")
        counts[name] = counts.get(name, 0) + 1
    return counts
|
core/citations.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def citation_block(result: dict) -> str:
    """Format a standard multi-line citation footer for one search result."""
    source = result.get('source')
    title = result.get('title', 'Unknown')
    url = result.get('url')
    retrieved = result.get('retrieved_at', 'N/A')
    lines = [
        "---",
        f"Source: {source}",
        f"Title: {title}",
        f"URL: {url}",
        f"Retrieved: {retrieved}",
        "---",
    ]
    return "\n".join(lines)
|
core/cluster.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict
from core.faiss_vector import FaissIndex


def cluster_results(results: List[dict], k: int = 5) -> Dict[str, List[dict]]:
    """Group search results by their source agency.

    FIX: the original built a FaissIndex over the snippets and then threw it
    away — pure dead work that also made this crash when FAISS was not
    installed, even though the grouping never used it. The index build is
    removed. `k` is unused by the grouping but kept for interface stability.
    """
    clusters: Dict[str, List[dict]] = {}
    for result in results:
        clusters.setdefault(result.get("source", "Unknown"), []).append(result)
    return clusters
|
core/explain.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def explain(results):
    """Summarize how a result set was produced (provenance/transparency).

    FIX: the original returned ``list(set(...))`` whose order is
    nondeterministic between runs; sources are now sorted (key=str so a
    missing/None source cannot break the comparison).
    """
    sources = sorted({r.get("source") for r in results}, key=str)
    return {
        "total_results": len(results),
        "sources": sources,
        "methods": [
            "Public FOIA reading room search",
            "Async fan-out querying",
            "Deduplication",
            "Semantic refinement (FAISS)"
        ],
        "no_restricted_access": True
    }
|
core/faiss_vector.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Optional dependencies: degrade gracefully when FAISS / sentence-transformers
# are missing; FaissIndex then fails loudly at construction time.
try:
    import faiss
    from sentence_transformers import SentenceTransformer
except ImportError:
    faiss = None
    SentenceTransformer = None  # FIX: was left unbound on ImportError


class FaissIndex:
    """Thin wrapper around a flat-L2 FAISS index over text snippets."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        if faiss is None:
            raise RuntimeError("FAISS not installed")
        self.model = SentenceTransformer(model_name)
        self.index = None  # created lazily on first add()
        self.docs = []     # stored texts, parallel to index rows

    def add(self, texts):
        """Embed *texts* and append them to the index (no-op for empty input)."""
        if not texts:
            return  # avoid encoding/adding a zero-row batch
        emb = self.model.encode(texts)
        if self.index is None:
            self.index = faiss.IndexFlatL2(emb.shape[1])
        self.index.add(emb)
        self.docs.extend(texts)

    def search(self, query, k=5):
        """Return up to *k* stored texts nearest to *query* (L2 distance)."""
        if self.index is None:
            return []  # FIX: original raised AttributeError on an empty index
        emb = self.model.encode([query])
        D, I = self.index.search(emb, k)
        # FIX: FAISS pads missing neighbours with -1; the original's
        # `i < len(self.docs)` test let -1 through and spuriously returned
        # docs[-1]. Require a non-negative, in-range row id.
        return [self.docs[i] for i in I[0] if 0 <= i < len(self.docs)]
|
core/health.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class HealthRegistry:
    """Tracks per-agency health flags; unknown agencies default to healthy."""

    def __init__(self):
        # agency name -> bool (True = healthy/enabled)
        self.state = {}

    def update(self, agency, healthy=True):
        """Set the health flag for *agency*."""
        self.state[agency] = healthy

    def is_enabled(self, agency):
        """Return the agency's flag; agencies never seen count as healthy."""
        return self.state.get(agency, True)

    def badge(self, agency):
        """Human-readable status badge for *agency*."""
        if self.is_enabled(agency):
            return "🟢 Healthy"
        return "🔴 Disabled"
|
core/index.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, json, numpy as np
from core.vector import embed


def load_docs(base="data/demo"):
    """Load the demo corpus under *base* and embed each document's text.

    Reads ``<base>/metadata.json`` (mapping filename -> metadata dict) and
    each file under ``<base>/documents/``; returns a list of dicts with
    id/text/vec/agency/year/program keys.

    FIX: the original leaked both file handles via ``json.load(open(...))``
    and ``open(...).read()`` and opened the metadata file without an explicit
    encoding; both opens now use context managers and UTF-8.
    """
    with open(os.path.join(base, "metadata.json"), encoding="utf-8") as meta_f:
        meta = json.load(meta_f)
    docs = []
    for fname, m in meta.items():
        with open(os.path.join(base, "documents", fname), encoding="utf-8") as doc_f:
            text = doc_f.read()
        docs.append({
            "id": fname,
            "text": text,
            "vec": embed(text),
            "agency": m["agency"],
            "year": m["year"],
            "program": m.get("program", "Unknown")
        })
    return docs
|
core/journalist.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import zipfile, json, os, time


def journalist_export(results, out_path):
    """Export *results* as a ZIP of per-document JSON files plus an INDEX.

    Each result becomes ``doc_<i>.json`` holding source/url/snippet plus an
    export timestamp; ``INDEX.json`` aggregates the same records. Returns
    *out_path* for convenience.
    """
    manifest = []
    with zipfile.ZipFile(out_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for position, result in enumerate(results):
            record = {
                "source": result.get("source"),
                "url": result.get("url"),
                "snippet": result.get("snippet"),
                "timestamp": time.time()
            }
            archive.writestr(f"doc_{position}.json", json.dumps(record, indent=2))
            manifest.append(record)
        archive.writestr("INDEX.json", json.dumps(manifest, indent=2))
    return out_path
|
core/multi_program.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List


def compare_programs(docs: List[dict]) -> Dict[str, Dict[str, int]]:
    """Count documents per agency per year.

    The year is the first four characters of each doc's "date" field;
    documents whose date does not start with four digits are skipped.
    """
    matrix: Dict[str, Dict[str, int]] = {}

    for doc in docs:
        year = doc.get("date", "")[:4]
        if not year.isdigit():
            continue
        per_year = matrix.setdefault(doc.get("agency", "Unknown"), {})
        per_year[year] = per_year.get(year, 0) + 1

    return matrix
|
core/preview.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import html


def safe_preview(url: str):
    """Return an HTML preview snippet for *url*, or a safe placeholder.

    Empty URLs and URLs that look like raw documents (pdf/download) get a
    placeholder instead of an iframe.

    FIX (security): the URL comes from external search results and was
    interpolated unescaped into the iframe's src attribute, allowing markup
    injection; it is now attribute-escaped with html.escape.
    """
    if not url:
        return "<i>No preview available</i>"
    lowered = url.lower()
    if any(marker in lowered for marker in ("pdf", "download")):
        return "<i>Preview disabled (redaction-protected document)</i>"
    safe_url = html.escape(url, quote=True)
    return f"<iframe src='{safe_url}' width='100%' height='400'></iframe>"
|
core/redaction.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def redaction_confidence(result: dict) -> float:
|
| 2 |
+
url = result.get("url","").lower()
|
| 3 |
+
score = 0.9
|
| 4 |
+
if "pdf" in url: score -= 0.3
|
| 5 |
+
if "redact" in url: score -= 0.4
|
| 6 |
+
if "download" in url: score -= 0.2
|
| 7 |
+
return max(0.0, score)
|
core/saved.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json, hashlib, time

# In-memory store of saved searches, keyed by a short content hash.
_STORE = {}


def save_search(query, sources):
    """Snapshot a query plus its source list; return a 12-char lookup key."""
    payload = {"q": query, "s": sources, "t": int(time.time())}
    digest = hashlib.sha256(json.dumps(payload).encode()).hexdigest()
    key = digest[:12]
    _STORE[key] = payload
    return key


def load_search(key):
    """Return the saved payload for *key*, or None if unknown."""
    return _STORE.get(key)
|
core/search.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, TypedDict
import json
import os

# Location of the pre-built demo index (a JSON list of document records).
DATA_PATH = "data/index/demo_index.json"


class SearchResult(TypedDict):
    document: str
    agency: str
    date: str
    excerpt: str
    citation: str
    score: float


def search_docs(query: str) -> List[SearchResult]:
    """Case-insensitive substring search over the demo index.

    Score is the query's occurrence count normalized by excerpt length;
    results come back best-first. A missing index file yields [].

    FIX: the index file is now opened with an explicit UTF-8 encoding
    instead of the platform default.
    """
    query_l = query.lower()
    results: List[SearchResult] = []

    if not os.path.exists(DATA_PATH):
        return results

    with open(DATA_PATH, "r", encoding="utf-8") as f:
        docs = json.load(f)

    for d in docs:
        text = d["excerpt"].lower()
        if query_l not in text:
            continue
        score = text.count(query_l) / max(len(text), 1)
        results.append({
            "document": d["document"],
            "agency": d["agency"],
            "date": d["date"],
            "excerpt": d["excerpt"],
            "citation": d["citation"],
            "score": round(score, 4)
        })

    return sorted(results, key=lambda x: x["score"], reverse=True)
|
core/semantic_refine.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
from core.faiss_vector import FaissIndex


class SemanticRefiner:
    """Semantic re-ranking over result snippets via a local FAISS index."""

    def __init__(self):
        self.index = FaissIndex()

    def build_from_results(self, results: List[dict]):
        """Index every non-empty snippet found in *results*."""
        snippets = [item.get("snippet", "") for item in results if item.get("snippet")]
        if snippets:
            self.index.add(snippets)

    def refine(self, query: str, k: int = 10):
        """Return up to *k* indexed snippets semantically closest to *query*."""
        return self.index.search(query, k)
|
core/throttle.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
import time


class AgencyThrottle:
    """Async rate limiter enforcing a minimum interval between calls."""

    def __init__(self, min_interval=1.0):
        self.min_interval = min_interval  # seconds required between calls
        self.last_call = 0                # UNIX time of the most recent call
        self.healthy = True               # flipped off by mark_unhealthy()

    async def wait(self):
        """Sleep just long enough to honor min_interval, then stamp the call.

        FIX: the original called asyncio.sleep without ever importing
        asyncio, raising NameError the first time throttling kicked in.
        """
        delta = time.time() - self.last_call
        if delta < self.min_interval:
            await asyncio.sleep(self.min_interval - delta)
        self.last_call = time.time()

    def mark_unhealthy(self):
        """Flag this agency as unhealthy; callers decide how to react."""
        self.healthy = False
|
core/vector.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
import re
import zlib

# Dimensionality of the hashing-trick embedding space.
DIM = 512


def embed(text: str) -> np.ndarray:
    """L2-normalized bag-of-words hashing embedding of *text*.

    Tokens are runs of two or more ASCII letters, lower-cased, and bucketed
    into a fixed DIM-wide vector; the all-zero vector is returned unchanged.

    FIX: the original bucketed with the builtin hash(), which is salted per
    process (PYTHONHASHSEED), so the same text embedded differently across
    runs — breaking any persisted vectors. zlib.crc32 is stable everywhere.
    """
    v = np.zeros(DIM, dtype=np.float32)
    for w in re.findall(r"[a-zA-Z]{2,}", text.lower()):
        v[zlib.crc32(w.encode("utf-8")) % DIM] += 1.0
    n = np.linalg.norm(v)
    return v / n if n > 0 else v