codelion committed on
Commit
620638a
·
verified ·
1 Parent(s): bbbff8b

Switch to synchronous dataset push (no CommitScheduler), add HF_TOKEN for private dataset access

Browse files
Files changed (1) hide show
  1. app.py +53 -52
app.py CHANGED
@@ -6,13 +6,11 @@ import urllib.parse
6
  import urllib.request
7
  from datetime import datetime
8
  from html.parser import HTMLParser
9
- from pathlib import Path
10
  from uuid import uuid4
11
 
12
  import gradio as gr
13
  from PIL import Image, ImageDraw, ImageFont
14
  from adaptive_classifier import AdaptiveClassifier
15
- from huggingface_hub import CommitScheduler
16
 
17
  # ---------------------------------------------------------------------------
18
  # Model
@@ -26,24 +24,43 @@ print("Model loaded!")
26
  # ---------------------------------------------------------------------------
27
  # Persistent dataset via CommitScheduler
28
  # ---------------------------------------------------------------------------
29
- DATA_DIR = Path("prediction_data")
30
- DATA_DIR.mkdir(parents=True, exist_ok=True)
31
- DATA_FILE = DATA_DIR / f"predictions-{uuid4()}.jsonl"
32
-
33
- scheduler = CommitScheduler(
34
- repo_id="adaptive-classifier/ai-detector-data",
35
- repo_type="dataset",
36
- folder_path=DATA_DIR,
37
- path_in_repo="data",
38
- private=True,
39
- )
40
 
41
- # In-memory index for lookups by ID (populated from current session)
42
- _predictions = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
 
45
  def save_prediction(pred_id: str, text: str, url: str, label: str, confidence: float):
46
- """Save a prediction to the dataset."""
47
  record = {
48
  "id": pred_id,
49
  "text": text,
@@ -54,61 +71,45 @@ def save_prediction(pred_id: str, text: str, url: str, label: str, confidence: f
54
  "timestamp": datetime.now().isoformat(),
55
  }
56
  _predictions[pred_id] = record
57
- with scheduler.lock:
58
- with DATA_FILE.open("a") as f:
59
- json.dump(record, f)
60
- f.write("\n")
61
 
62
 
63
  def lookup_prediction(pred_id: str) -> dict | None:
64
- """Look up a prediction by ID — check memory first, then dataset files."""
65
  if pred_id in _predictions:
66
  return _predictions[pred_id]
67
- # Search local data files (includes data synced from HF)
68
- for f in sorted(DATA_DIR.glob("*.jsonl")):
69
- try:
70
- for line in f.read_text().strip().split("\n"):
71
- if not line:
72
- continue
73
- rec = json.loads(line)
74
- if rec.get("id") == pred_id and "text" in rec:
75
- _predictions[pred_id] = rec # cache it
76
- return rec
77
- except Exception:
78
- continue
79
- # Try downloading from HF dataset
80
  try:
81
- from huggingface_hub import HfApi
82
- api = HfApi()
83
- files = api.list_repo_files("adaptive-classifier/ai-detector-data", repo_type="dataset")
84
- data_files = [f for f in files if f.startswith("data/") and f.endswith(".jsonl")]
85
- for df in data_files:
86
- path = api.hf_hub_download("adaptive-classifier/ai-detector-data", df, repo_type="dataset")
87
- for line in open(path).read().strip().split("\n"):
88
- if not line:
89
- continue
90
- rec = json.loads(line)
91
- if rec.get("id") == pred_id and "text" in rec:
92
- _predictions[pred_id] = rec
93
- return rec
94
  except Exception:
95
  pass
96
  return None
97
 
98
 
99
  def save_feedback(pred_id: str, feedback: str):
100
- """Save user feedback for a prediction."""
101
  record = {
102
  "id": pred_id,
103
  "feedback": feedback,
104
  "timestamp": datetime.now().isoformat(),
105
  }
106
- with scheduler.lock:
107
- with DATA_FILE.open("a") as f:
108
- json.dump(record, f)
109
- f.write("\n")
110
  if pred_id in _predictions:
111
  _predictions[pred_id]["feedback"] = feedback
 
 
 
 
112
 
113
 
114
  # ---------------------------------------------------------------------------
 
6
  import urllib.request
7
  from datetime import datetime
8
  from html.parser import HTMLParser
 
9
  from uuid import uuid4
10
 
11
  import gradio as gr
12
  from PIL import Image, ImageDraw, ImageFont
13
  from adaptive_classifier import AdaptiveClassifier
 
14
 
15
  # ---------------------------------------------------------------------------
16
  # Model
 
24
  # ---------------------------------------------------------------------------
25
  # Persistent dataset via CommitScheduler
26
  # ---------------------------------------------------------------------------
27
+ DATASET_REPO = "adaptive-classifier/ai-detector-data"
28
+ _predictions = {} # In-memory cache
29
+ _hf_api = None
30
+
31
+
32
+ def _get_api():
33
+ global _hf_api
34
+ if _hf_api is None:
35
+ from huggingface_hub import HfApi
36
+ _hf_api = HfApi()
37
+ return _hf_api
38
 
39
+
40
+ def _push_record(record: dict):
41
+ """Append a record to the dataset by uploading a JSONL file."""
42
+ import io
43
+ api = _get_api()
44
+ line = json.dumps(record) + "\n"
45
+ # Append to a single predictions.jsonl file
46
+ # Download existing, append, re-upload
47
+ try:
48
+ path = api.hf_hub_download(DATASET_REPO, "data/predictions.jsonl", repo_type="dataset")
49
+ existing = open(path).read()
50
+ except Exception:
51
+ existing = ""
52
+ content = existing + line
53
+ api.upload_file(
54
+ path_or_fileobj=io.BytesIO(content.encode()),
55
+ path_in_repo="data/predictions.jsonl",
56
+ repo_id=DATASET_REPO,
57
+ repo_type="dataset",
58
+ commit_message=f"Add prediction {record.get('id', '')}",
59
+ )
60
 
61
 
62
  def save_prediction(pred_id: str, text: str, url: str, label: str, confidence: float):
63
+ """Save a prediction to memory and push to HF dataset."""
64
  record = {
65
  "id": pred_id,
66
  "text": text,
 
71
  "timestamp": datetime.now().isoformat(),
72
  }
73
  _predictions[pred_id] = record
74
+ try:
75
+ _push_record(record)
76
+ except Exception as e:
77
+ print(f"Warning: failed to push prediction to dataset: {e}")
78
 
79
 
80
  def lookup_prediction(pred_id: str) -> dict | None:
81
+ """Look up a prediction by ID — check memory, then HF dataset."""
82
  if pred_id in _predictions:
83
  return _predictions[pred_id]
84
+ # Download from HF dataset
 
 
 
 
 
 
 
 
 
 
 
 
85
  try:
86
+ api = _get_api()
87
+ path = api.hf_hub_download(DATASET_REPO, "data/predictions.jsonl", repo_type="dataset")
88
+ for line in open(path).read().strip().split("\n"):
89
+ if not line:
90
+ continue
91
+ rec = json.loads(line)
92
+ if rec.get("id") == pred_id and "text" in rec:
93
+ _predictions[pred_id] = rec
94
+ return rec
 
 
 
 
95
  except Exception:
96
  pass
97
  return None
98
 
99
 
100
  def save_feedback(pred_id: str, feedback: str):
101
+ """Save user feedback to memory and push to HF dataset."""
102
  record = {
103
  "id": pred_id,
104
  "feedback": feedback,
105
  "timestamp": datetime.now().isoformat(),
106
  }
 
 
 
 
107
  if pred_id in _predictions:
108
  _predictions[pred_id]["feedback"] = feedback
109
+ try:
110
+ _push_record(record)
111
+ except Exception as e:
112
+ print(f"Warning: failed to push feedback to dataset: {e}")
113
 
114
 
115
  # ---------------------------------------------------------------------------