GodsDevProject committed on
Commit
5830944
·
verified ·
1 Parent(s): f1b3a17

Upload 20 files

Browse files
core/__init__.py ADDED
File without changes
core/analytics.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import time
3
+ from collections import Counter
4
+
5
_events = Counter()  # process-wide event tally


def track(event: str):
    """Record one occurrence of the named event."""
    _events[event] += 1


def snapshot():
    """Return the current event tally plus a capture timestamp (epoch seconds)."""
    return {
        "timestamp": int(time.time()),
        "events": dict(_events),
    }
core/appeals.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def draft_appeal(document: str, agency: str, reason: str) -> str:
    """Compose a plain-text FOIA appeal letter for a publicly released document."""
    letter = f"""
FOIA Appeal – Request for Reconsideration

Agency: {agency}
Document: {document}

Basis for Appeal:
{reason}

This appeal concerns a publicly released document and requests
review of redactions or withholdings under applicable FOIA exemptions.

Sincerely,
[Requestor]
"""
    # Leading/trailing whitespace around the template is cosmetic only.
    return letter.strip()
core/async_search.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+
3
async def fanout_search(adapters, query):
    """Query every adapter concurrently and merge their list results.

    Adapters whose search() raises, or which return anything other than a
    list, are silently skipped.
    """
    outcomes = await asyncio.gather(
        *(adapter.search(query) for adapter in adapters),
        return_exceptions=True,
    )
    merged = []
    for outcome in outcomes:
        if isinstance(outcome, list):
            merged.extend(outcome)
    return merged
core/cache.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from typing import Dict, Any, List
3
+ from core.faiss_vector import FaissIndex
4
+
5
_TTL = 300  # cache entry lifetime, seconds
_cache: Dict[str, Any] = {}
_faiss = None  # lazily created FAISS index singleton


def _now():
    """Current wall-clock time as whole seconds."""
    return int(time.time())


def _get_index():
    """Lazily construct and return the shared FAISS index."""
    global _faiss
    if _faiss is None:
        _faiss = FaissIndex()
    return _faiss


def cache_get(key):
    """Return cached data for *key*, or None when absent or expired."""
    entry = _cache.get(key)
    if not entry:
        return None
    stored_at, payload = entry
    if _now() - stored_at > _TTL:
        # Lazy eviction: stale entries are dropped on read.
        _cache.pop(key, None)
        return None
    return payload


def cache_set(key, data: List[dict]):
    """Store *data* under *key* and feed its snippets to the FAISS index."""
    _cache[key] = (_now(), data)
    # add snippets to FAISS for local semantic recall
    snippets = [item.get("snippet", "") for item in data if item.get("snippet")]
    if snippets:
        try:
            _get_index().add(snippets)
        except Exception:
            # Best-effort: semantic recall is optional and must never
            # prevent the plain cache write from succeeding.
            pass
37
+
38
def dedupe(results: List[dict]) -> List[dict]:
    """Drop repeated results, keeping the first occurrence.

    Identity is the (source, url, snippet) triple.
    """
    seen_keys = set()
    unique: List[dict] = []
    for item in results:
        identity = hash((item.get("source"), item.get("url"), item.get("snippet")))
        if identity in seen_keys:
            continue
        seen_keys.add(identity)
        unique.append(item)
    return unique
47
+
48
def source_counts(results: List[dict]) -> Dict[str, int]:
    """Tally how many results each source contributed."""
    tally: Dict[str, int] = {}
    for item in results:
        name = item.get("source", "Unknown")
        if name in tally:
            tally[name] += 1
        else:
            tally[name] = 1
    return tally
core/citations.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
def citation_block(result: dict) -> str:
    """Render a standard citation footer for one search result."""
    source = result.get('source')
    title = result.get('title', 'Unknown')
    url = result.get('url')
    retrieved = result.get('retrieved_at', 'N/A')
    return (
        "---\n"
        f"Source: {source}\n"
        f"Title: {title}\n"
        f"URL: {url}\n"
        f"Retrieved: {retrieved}\n"
        "---"
    )
core/cluster.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict
2
+ from core.faiss_vector import FaissIndex
3
+
4
def cluster_results(results: List[dict], k: int = 5) -> Dict[str, List[dict]]:
    """Group results by their source agency, preserving input order.

    The previous version also built a FaissIndex from the snippets, but the
    index was never queried and was discarded on return — pure dead work that
    additionally made the whole call raise whenever FAISS was not installed.
    That has been removed; `k` is kept in the signature for backward
    compatibility (reserved for a future semantic clustering pass).
    """
    clusters: Dict[str, List[dict]] = {}
    for result in results:
        clusters.setdefault(result.get("source", "Unknown"), []).append(result)
    return clusters
core/explain.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def explain(results):
    """Summarize how a result set was produced, for UI transparency.

    Returns counts, the distinct sources, the pipeline steps, and a flag
    affirming only public access methods were used.
    """
    # dict.fromkeys dedupes while preserving first-seen order; the previous
    # list(set(...)) produced a different source order on every run.
    unique_sources = list(dict.fromkeys(r.get("source") for r in results))
    return {
        "total_results": len(results),
        "sources": unique_sources,
        "methods": [
            "Public FOIA reading room search",
            "Async fan-out querying",
            "Deduplication",
            "Semantic refinement (FAISS)"
        ],
        "no_restricted_access": True
    }
core/faiss_vector.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Optional dependencies: fall back to sentinel None values so importing this
# module never fails; FaissIndex.__init__ raises a clear error instead.
try:
    import faiss
    from sentence_transformers import SentenceTransformer
except ImportError:
    # Bug fix: the old handler reset only `faiss`, leaving
    # `SentenceTransformer` undefined and causing a NameError later.
    faiss = None
    SentenceTransformer = None
6
+
7
class FaissIndex:
    """Thin wrapper around a FAISS L2 index using sentence-transformer embeddings."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # Fail fast with a clear error when the optional deps are missing.
        if faiss is None or SentenceTransformer is None:
            raise RuntimeError("FAISS not installed")
        self.model = SentenceTransformer(model_name)
        self.index = None  # created lazily on first add(), once dim is known
        self.docs = []     # stored texts, row-parallel with the index

    def add(self, texts):
        """Embed *texts* and append them to the index."""
        emb = self.model.encode(texts)
        if self.index is None:
            self.index = faiss.IndexFlatL2(emb.shape[1])
        self.index.add(emb)
        self.docs.extend(texts)

    def search(self, query, k=5):
        """Return up to *k* stored texts nearest to *query*.

        Returns [] when nothing has been indexed yet (the previous version
        crashed with AttributeError on a None index).
        """
        if self.index is None:
            return []
        emb = self.model.encode([query])
        D, I = self.index.search(emb, k)
        # FAISS pads missing neighbours with -1; the old `i < len(...)`
        # check let -1 through and silently returned docs[-1].
        return [self.docs[i] for i in I[0] if 0 <= i < len(self.docs)]
core/health.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class HealthRegistry:
    """Tracks per-agency availability; unknown agencies default to healthy."""

    def __init__(self):
        # agency name -> bool (True means healthy/enabled)
        self.state = {}

    def update(self, agency, healthy=True):
        """Record the latest health observation for an agency."""
        self.state[agency] = healthy

    def is_enabled(self, agency):
        """Whether the agency should be queried (defaults to True)."""
        return self.state.get(agency, True)

    def badge(self, agency):
        """Human-readable status badge for the UI."""
        if self.is_enabled(agency):
            return "🟢 Healthy"
        return "🔴 Disabled"
core/index.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json, numpy as np
2
+ from core.vector import embed
3
+
4
def load_docs(base="data/demo"):
    """Load demo documents with metadata and embed each one.

    Reads `<base>/metadata.json` (mapping filename -> metadata) and each
    document from `<base>/documents/<filename>`. Returns a list of dicts
    with keys: id, text, vec, agency, year, program.

    Bug fix: both files were previously opened without a context manager and
    never closed; the metadata file also lacked an explicit encoding.
    """
    with open(os.path.join(base, "metadata.json"), encoding="utf-8") as fh:
        meta = json.load(fh)

    docs = []
    for fname, m in meta.items():
        with open(os.path.join(base, "documents", fname), encoding="utf-8") as fh:
            text = fh.read()
        docs.append({
            "id": fname,
            "text": text,
            "vec": embed(text),
            "agency": m["agency"],
            "year": m["year"],
            "program": m.get("program", "Unknown"),
        })
    return docs
core/journalist.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import zipfile, json, os, time
2
+
3
def journalist_export(results, out_path):
    """Bundle results into a zip: one JSON file per document plus an INDEX.json.

    Each entry keeps source, url, snippet and an export timestamp.
    Returns out_path.
    """
    manifest = []
    with zipfile.ZipFile(out_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for position, result in enumerate(results):
            record = {
                "source": result.get("source"),
                "url": result.get("url"),
                "snippet": result.get("snippet"),
                "timestamp": time.time()
            }
            archive.writestr(f"doc_{position}.json", json.dumps(record, indent=2))
            manifest.append(record)
        # Manifest goes in last so it reflects every written document.
        archive.writestr("INDEX.json", json.dumps(manifest, indent=2))
    return out_path
core/multi_program.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ def compare_programs(docs: List[dict]) -> Dict[str, Dict[str, int]]:
4
+ matrix: Dict[str, Dict[str, int]] = {}
5
+
6
+ for d in docs:
7
+ agency = d.get("agency", "Unknown")
8
+ year = d.get("date", "")[:4]
9
+
10
+ if not year.isdigit():
11
+ continue
12
+
13
+ matrix.setdefault(agency, {})
14
+ matrix[agency][year] = matrix[agency].get(year, 0) + 1
15
+
16
+ return matrix
core/preview.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
def safe_preview(url: str):
    """Return an HTML fragment previewing *url*, or a notice when unavailable.

    Empty/None URLs and direct file links (pdf/download) get a placeholder
    instead of an iframe.
    """
    # Local import keeps this security fix self-contained in the function.
    import html

    if not url:
        return "<i>No preview available</i>"
    blocked_markers = ["pdf", "download"]
    if any(marker in url.lower() for marker in blocked_markers):
        return "<i>Preview disabled (redaction-protected document)</i>"
    # Security fix: escape the URL so quotes inside it cannot break out of
    # the src attribute and inject markup (XSS).
    safe_url = html.escape(url, quote=True)
    return f"<iframe src='{safe_url}' width='100%' height='400'></iframe>"
core/redaction.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
def redaction_confidence(result: dict) -> float:
    """Heuristic confidence that a result's redactions are intact.

    Starts at 0.9 and subtracts a penalty per suspicious URL marker,
    clamped to a minimum of 0.0.
    """
    url = result.get("url", "").lower()
    penalties = (("pdf", 0.3), ("redact", 0.4), ("download", 0.2))
    score = 0.9
    for marker, penalty in penalties:
        if marker in url:
            score -= penalty
    return max(0.0, score)
core/saved.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json, hashlib, time
2
+
3
_STORE = {}  # in-memory saved-search registry: key -> payload


def save_search(query, sources):
    """Store a query/sources pair in memory and return a short lookup key.

    The key is the first 12 hex chars of the SHA-256 of the payload
    (query, sources, save time).
    """
    payload = {"q": query, "s": sources, "t": int(time.time())}
    digest = hashlib.sha256(json.dumps(payload).encode()).hexdigest()
    key = digest[:12]
    _STORE[key] = payload
    return key


def load_search(key):
    """Fetch a previously saved search, or None for an unknown key."""
    return _STORE.get(key)
core/search.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, TypedDict
2
+ import json
3
+ import os
4
+
5
DATA_PATH = "data/index/demo_index.json"

class SearchResult(TypedDict):
    """One keyword hit against the demo index."""
    document: str
    agency: str
    date: str
    excerpt: str
    citation: str
    score: float  # term frequency normalised by excerpt length


def search_docs(query: str) -> List[SearchResult]:
    """Case-insensitive substring search over the demo index.

    Returns hits sorted by descending score. A blank query or a missing
    index file yields an empty list.
    """
    query_l = query.lower()
    results: List[SearchResult] = []

    # Bug fix: "" is a substring of everything and text.count("") is
    # len(text)+1, so a blank query previously matched every document
    # with a nonsense score. Bail out early instead.
    if not query_l.strip():
        return results

    if not os.path.exists(DATA_PATH):
        return results

    with open(DATA_PATH, "r", encoding="utf-8") as f:
        docs = json.load(f)

    for d in docs:
        text = d["excerpt"].lower()
        if query_l in text:
            # Occurrences per character of excerpt, guarded against /0.
            score = text.count(query_l) / max(len(text), 1)
            results.append({
                "document": d["document"],
                "agency": d["agency"],
                "date": d["date"],
                "excerpt": d["excerpt"],
                "citation": d["citation"],
                "score": round(score, 4)
            })

    return sorted(results, key=lambda x: x["score"], reverse=True)
core/semantic_refine.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from core.faiss_vector import FaissIndex
3
+
4
+ class SemanticRefiner:
5
+ def __init__(self):
6
+ self.index = FaissIndex()
7
+
8
+ def build_from_results(self, results: List[dict]):
9
+ texts = [r.get("snippet","") for r in results if r.get("snippet")]
10
+ if texts:
11
+ self.index.add(texts)
12
+
13
+ def refine(self, query: str, k: int = 10):
14
+ return self.index.search(query, k)
core/throttle.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
class AgencyThrottle:
    """Per-agency rate limiter with a simple health flag.

    wait() enforces a minimum interval between calls; mark_unhealthy()
    lets callers flag the agency as failing.
    """

    def __init__(self, min_interval=1.0):
        self.min_interval = min_interval  # seconds required between calls
        self.last_call = 0                # epoch seconds of the last wait()
        self.healthy = True

    async def wait(self):
        """Sleep just long enough to honour min_interval, then stamp the call.

        Bug fix: the module only imported `time`, so the original body
        raised NameError on `asyncio.sleep` whenever throttling actually
        triggered. Imported locally to keep the fix self-contained.
        """
        import asyncio

        delta = time.time() - self.last_call
        if delta < self.min_interval:
            await asyncio.sleep(self.min_interval - delta)
        self.last_call = time.time()

    def mark_unhealthy(self):
        """Flag this agency as failing so callers can skip it."""
        self.healthy = False
core/vector.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import re
3
+
4
DIM = 512  # dimensionality of the hashing-trick embedding


def embed(text: str) -> np.ndarray:
    """Embed *text* as an L2-normalised bag-of-words hash vector.

    Each word of 2+ letters is hashed into one of DIM buckets and counted;
    the vector is then unit-normalised (zero vector for empty input).

    Bug fix: the original used builtin hash(), which is randomised per
    process for strings (PYTHONHASHSEED), so the "same" text embedded in
    two runs produced different, incomparable vectors. zlib.crc32 is
    deterministic across processes.
    """
    import zlib  # local import: stable hashing fix without touching headers

    v = np.zeros(DIM, dtype=np.float32)
    for w in re.findall(r"[a-zA-Z]{2,}", text.lower()):
        v[zlib.crc32(w.encode("utf-8")) % DIM] += 1.0
    n = np.linalg.norm(v)
    return v / n if n > 0 else v