AerdnaNami committed on
Commit
d39f04a
·
1 Parent(s): 955515c

Using datasets space

Browse files
Files changed (2) hide show
  1. app.py +181 -23
  2. requirements.txt +2 -1
app.py CHANGED
@@ -2,13 +2,27 @@ import sys
2
  import os
3
  import json
4
  import html
 
 
5
  from datetime import datetime
6
  from pathlib import Path
7
  import gradio as gr
 
 
 
 
 
 
 
8
 
9
  DATA_DIR = Path(__file__).parent / "data"
10
  PAPERS_DIR = DATA_DIR / "papers"
11
  ANNOTATIONS_ROOT = DATA_DIR / "user_annotations"
 
 
 
 
 
12
  CATEGORIES = ["Unsupported claim", "Format", "Coherence", "Lacks synthesis"]
13
  CATEGORY_COLORS = {
14
  "Unsupported claim": "#ffb3b3",
@@ -30,6 +44,111 @@ LEGEND_HTML = """
30
  </div>
31
  """
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  CATEGORY_JS = """
34
  (start, end) => {
35
  const findById = (id) => {
@@ -90,16 +209,24 @@ def list_papers():
90
 
91
  def list_user_ids():
92
  ids = set()
93
- if ANNOTATIONS_ROOT.exists():
94
- for path in ANNOTATIONS_ROOT.iterdir():
95
- if path.is_dir():
96
- ids.add(path.name)
97
- # Legacy single-file annotations
98
- base = Path(__file__).parent
99
- for path in base.glob("annotations_*.json"):
100
- stem = path.stem
101
- if stem.startswith("annotations_"):
102
- ids.add(stem[len("annotations_") :])
 
 
 
 
 
 
 
 
103
  return sorted(ids)
104
 
105
 
@@ -206,6 +333,19 @@ def _annotations_path(user_id, filename):
206
 
207
 
208
  def _load_annotations(user_id, filename=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  user_dir = _user_dir(user_id)
210
  if not user_dir:
211
  return []
@@ -253,6 +393,16 @@ def _load_annotations(user_id, filename=None):
253
 
254
 
255
  def _save_annotations(user_id, filename, records):
 
 
 
 
 
 
 
 
 
 
256
  path = _annotations_path(user_id, filename)
257
  if not path:
258
  raise ValueError("Missing user ID or filename.")
@@ -308,6 +458,8 @@ def save_annotation(user_id, filename, start, end, category):
308
  _save_annotations(user_id, filename, records)
309
  content = _render_with_highlights(text, _annotations_for(user_id, filename))
310
  annotations_json = load_annotations_for(user_id, filename)
 
 
311
  try:
312
  path = _annotations_path(user_id, filename)
313
  size = os.path.getsize(path)
@@ -360,6 +512,8 @@ def remove_annotation(user_id, filename, start, end):
360
  annotations_json = json.dumps(filtered, ensure_ascii=False, indent=2)
361
  if removed == 0:
362
  return "No overlapping highlights to remove.", content, annotations_json
 
 
363
  try:
364
  path = _annotations_path(user_id, filename)
365
  size = os.path.getsize(path)
@@ -378,19 +532,22 @@ def clear_all_annotations(user_id):
378
  if not user_id:
379
  return "Please enter your ID.", gr.update(), gr.update()
380
  try:
381
- user_dir = _user_dir(user_id)
382
- if user_dir and user_dir.exists():
383
- for path in user_dir.glob("*.json"):
 
 
 
 
 
 
 
 
 
384
  try:
385
- path.unlink()
386
  except Exception:
387
  pass
388
- legacy = Path(__file__).parent / f"annotations_{_normalize_user_id(user_id)}.json"
389
- if legacy.exists():
390
- try:
391
- legacy.unlink()
392
- except Exception:
393
- pass
394
  return "Cleared all annotations.", "<em>No file selected.</em>", "[]"
395
  except Exception as e:
396
  return f"Error clearing annotations: {e}", gr.update(), gr.update()
@@ -409,9 +566,10 @@ def _add_user_id(new_id):
409
  safe = _normalize_user_id(new_id)
410
  if not safe:
411
  return gr.update(), "**Please enter a valid ID.**", gr.update()
412
- ANNOTATIONS_ROOT.mkdir(parents=True, exist_ok=True)
413
- user_dir = ANNOTATIONS_ROOT / safe
414
- user_dir.mkdir(parents=True, exist_ok=True)
 
415
  choices = list_user_ids()
416
  if safe not in choices:
417
  choices.append(safe)
 
2
  import os
3
  import json
4
  import html
5
+ import time
6
+ import tempfile
7
  from datetime import datetime
8
  from pathlib import Path
9
  import gradio as gr
10
+ try:
11
+ from huggingface_hub import HfApi, hf_hub_download
12
+ from huggingface_hub.utils import EntryNotFoundError
13
+ except Exception:
14
+ HfApi = None
15
+ hf_hub_download = None
16
+ EntryNotFoundError = Exception
17
 
18
  DATA_DIR = Path(__file__).parent / "data"
19
  PAPERS_DIR = DATA_DIR / "papers"
20
  ANNOTATIONS_ROOT = DATA_DIR / "user_annotations"
21
+ HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "ImanAndrea/citation_annotations")
22
+ HF_ANNOTATIONS_PREFIX = os.getenv("HF_ANNOTATIONS_PREFIX", "annotations")
23
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
24
+ HF_CACHE_DIR = Path(os.getenv("HF_CACHE_DIR", "/tmp/hf_cache"))
25
+ HF_CACHE_DIR.mkdir(parents=True, exist_ok=True)
26
  CATEGORIES = ["Unsupported claim", "Format", "Coherence", "Lacks synthesis"]
27
  CATEGORY_COLORS = {
28
  "Unsupported claim": "#ffb3b3",
 
44
  </div>
45
  """
46
 
47
+ _REPO_FILES_CACHE = {"ts": 0.0, "files": []}
48
+
49
+
50
def _use_hf_backend():
    """Return True when annotations should be stored in the Hugging Face dataset repo.

    The backend is enabled only when a token is configured (``HF_TOKEN``), a
    dataset repo id is set (``HF_DATASET_REPO``), and the guarded
    ``huggingface_hub`` import succeeded (``HfApi`` and ``hf_hub_download``
    are not ``None``).
    """
    # Wrap the whole chain in bool(): a bare ``and`` chain would hand back the
    # raw HF_DATASET_REPO string (e.g. "") instead of a boolean.
    return bool(
        HF_TOKEN
        and HF_DATASET_REPO
        and HfApi is not None
        and hf_hub_download is not None
    )
52
+
53
+
54
def _hf_api():
    """Build an ``HfApi`` client authenticated with ``HF_TOKEN``.

    Returns ``None`` when ``_use_hf_backend()`` reports the backend as disabled.
    """
    return HfApi(token=HF_TOKEN) if _use_hf_backend() else None
58
+
59
+
60
def _hf_repo_path(user_id, filename):
    """Return the in-repo path of one user's annotation file, or ``None``.

    The path has the form ``<HF_ANNOTATIONS_PREFIX>/<user>/<file>.json`` where
    ``<user>`` is the normalized user id and ``<file>`` is the sanitized base
    name of ``filename`` (any directory part is dropped).

    Returns ``None`` when the user id normalizes to an empty value or when
    ``filename`` is missing, so callers can report a single
    "missing user ID or filename" condition instead of hitting a ``TypeError``
    from ``Path(None)``.
    """
    safe_user = _normalize_user_id(user_id)
    if not safe_user or not filename:
        return None
    safe_file = _sanitize_filename(Path(filename).name)
    return f"{HF_ANNOTATIONS_PREFIX}/{safe_user}/{safe_file}.json"
66
+
67
+
68
def _hf_list_repo_files(ttl_seconds=30):
    """List every file path in the dataset repo, memoized for ``ttl_seconds``.

    The listing is kept in ``_REPO_FILES_CACHE`` together with the time it was
    taken. A cached listing younger than ``ttl_seconds`` is returned as is.
    When no API client is available an empty list is returned and the cache is
    left untouched. A failed listing call is stored (and returned) as an empty
    list.
    """
    checked_at = time.time()
    cache = _REPO_FILES_CACHE
    if checked_at - cache["ts"] < ttl_seconds:
        return cache["files"]

    client = _hf_api()
    if not client:
        return []

    try:
        listing = client.list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset")
    except Exception:
        listing = []

    cache["ts"] = checked_at
    cache["files"] = listing
    return listing
82
+
83
+
84
def _hf_download_annotations(repo_path):
    """Download one annotation file from the dataset repo and parse it.

    Args:
        repo_path: Path of the JSON file inside ``HF_DATASET_REPO``. A falsy
            value yields an empty list without contacting the Hub.

    Returns:
        The decoded list of annotation records, or ``[]`` when the file is
        not found, the download fails, the file cannot be parsed, or the
        top-level JSON value is not a list.

    Unexpected download/parse failures are logged rather than silently
    discarded, so an outage is distinguishable from "no annotations" in the
    logs while callers still receive ``[]``.
    """
    # Local import: the rest of the module does not depend on logging.
    import logging

    if not repo_path:
        return []

    log = logging.getLogger(__name__)
    try:
        local_path = hf_hub_download(
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            filename=repo_path,
            token=HF_TOKEN,
            local_dir=str(HF_CACHE_DIR),
        )
    except EntryNotFoundError:
        # The requested entry is not in the repo: nothing to load.
        return []
    except Exception:
        log.warning(
            "Could not download %s from %s", repo_path, HF_DATASET_REPO, exc_info=True
        )
        return []

    try:
        with open(local_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        log.warning("Could not read annotations from %s", local_path, exc_info=True)
        return []
    return data if isinstance(data, list) else []
105
+
106
+
107
def _hf_upload_annotations(repo_path, records, commit_message):
    """Serialize ``records`` to JSON and upload them to ``repo_path`` in the dataset repo.

    Args:
        repo_path: Destination path inside ``HF_DATASET_REPO``.
        records: JSON-serializable annotation records.
        commit_message: Commit message used for the upload.

    Raises:
        RuntimeError: If no Hugging Face API client is available.

    Errors raised while serializing or uploading propagate to the caller. The
    temporary file is removed in every case, and the repo file listing cache
    is invalidated after a successful upload.
    """
    api = _hf_api()
    if not api:
        raise RuntimeError("Hugging Face backend unavailable.")
    HF_CACHE_DIR.mkdir(parents=True, exist_ok=True)

    tmp_path = None
    try:
        # Write through the handle returned by NamedTemporaryFile and close it
        # via the context manager; opening the name a second time and never
        # closing the first handle leaks a file descriptor per save.
        with tempfile.NamedTemporaryFile(
            "w",
            delete=False,
            suffix=".json",
            dir=str(HF_CACHE_DIR),
            encoding="utf-8",
        ) as tmp_file:
            tmp_path = tmp_file.name
            json.dump(records, tmp_file, ensure_ascii=False, indent=2)
        api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo=repo_path,
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            commit_message=commit_message,
        )
    finally:
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask the
                # real outcome of the upload.
                pass
    # Force the next listing to hit the Hub so the new file is visible.
    _REPO_FILES_CACHE["ts"] = 0.0
129
+
130
+
131
def _hf_delete_user_annotations(user_id):
    """Delete every annotation file stored for ``user_id`` in the dataset repo.

    Files are matched by the ``<HF_ANNOTATIONS_PREFIX>/<user>/`` path prefix,
    where ``<user>`` is the normalized user id. Nothing happens when the id
    normalizes to an empty value.

    Raises:
        RuntimeError: If no Hugging Face API client is available, or if one
            or more files could not be deleted. Every matching file is still
            attempted before the error is raised.
    """
    api = _hf_api()
    if not api:
        raise RuntimeError("Hugging Face backend unavailable.")
    safe_user = _normalize_user_id(user_id)
    if not safe_user:
        return
    prefix = f"{HF_ANNOTATIONS_PREFIX}/{safe_user}/"
    # ttl_seconds=0 bypasses the listing cache so recently added files are
    # not skipped because of a stale listing.
    targets = [f for f in _hf_list_repo_files(ttl_seconds=0) if f.startswith(prefix)]
    failures = []
    for path in targets:
        try:
            api.delete_file(
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                path_in_repo=path,
                commit_message=f"Delete annotations for {safe_user}",
            )
        except Exception as exc:
            # Keep going so one bad file does not block the rest.
            failures.append(f"{path} ({exc})")
    # The repo content changed (or may have); drop the cached listing.
    _REPO_FILES_CACHE["ts"] = 0.0
    if failures:
        raise RuntimeError("Could not delete: " + "; ".join(failures))
151
+
152
  CATEGORY_JS = """
153
  (start, end) => {
154
  const findById = (id) => {
 
209
 
210
def list_user_ids():
    """Return the sorted list of user ids that have stored annotations.

    With the Hugging Face backend enabled, ids are the first path component
    below ``HF_ANNOTATIONS_PREFIX`` in the dataset repo listing. Otherwise they
    are the sub-directory names of ``ANNOTATIONS_ROOT`` plus the ids encoded in
    legacy ``annotations_<id>.json`` files next to this module.
    """
    ids = set()
    if _use_hf_backend():
        prefix = f"{HF_ANNOTATIONS_PREFIX}/"
        for path in _hf_list_repo_files():
            if not path.startswith(prefix):
                continue
            # Strip the prefix before splitting so a prefix that itself
            # contains "/" (it is configurable via the environment) still
            # yields the user component rather than part of the prefix.
            user, sep, _rest = path[len(prefix):].partition("/")
            # Require a separator after the id: "<prefix>/<user>" alone is a
            # file directly under the prefix, not a user directory.
            if user and sep:
                ids.add(user)
    else:
        if ANNOTATIONS_ROOT.exists():
            ids.update(p.name for p in ANNOTATIONS_ROOT.iterdir() if p.is_dir())
        # Legacy single-file annotations
        base = Path(__file__).parent
        for path in base.glob("annotations_*.json"):
            stem = path.stem
            if stem.startswith("annotations_"):
                ids.add(stem[len("annotations_"):])
    return sorted(ids)
231
 
232
 
 
333
 
334
 
335
  def _load_annotations(user_id, filename=None):
336
+ if _use_hf_backend():
337
+ if filename:
338
+ repo_path = _hf_repo_path(user_id, filename)
339
+ return _hf_download_annotations(repo_path)
340
+ safe_user = _normalize_user_id(user_id)
341
+ if not safe_user:
342
+ return []
343
+ prefix = f"{HF_ANNOTATIONS_PREFIX}/{safe_user}/"
344
+ records = []
345
+ for path in _hf_list_repo_files():
346
+ if path.startswith(prefix) and path.endswith(".json"):
347
+ records.extend(_hf_download_annotations(path))
348
+ return records
349
  user_dir = _user_dir(user_id)
350
  if not user_dir:
351
  return []
 
393
 
394
 
395
  def _save_annotations(user_id, filename, records):
396
+ if _use_hf_backend():
397
+ repo_path = _hf_repo_path(user_id, filename)
398
+ if not repo_path:
399
+ raise ValueError("Missing user ID or filename.")
400
+ _hf_upload_annotations(
401
+ repo_path,
402
+ records,
403
+ commit_message=f"Update annotations for {_normalize_user_id(user_id)}/{_sanitize_filename(filename)}",
404
+ )
405
+ return
406
  path = _annotations_path(user_id, filename)
407
  if not path:
408
  raise ValueError("Missing user ID or filename.")
 
458
  _save_annotations(user_id, filename, records)
459
  content = _render_with_highlights(text, _annotations_for(user_id, filename))
460
  annotations_json = load_annotations_for(user_id, filename)
461
+ if _use_hf_backend():
462
+ return f"Saved annotation for {filename}.", content, annotations_json
463
  try:
464
  path = _annotations_path(user_id, filename)
465
  size = os.path.getsize(path)
 
512
  annotations_json = json.dumps(filtered, ensure_ascii=False, indent=2)
513
  if removed == 0:
514
  return "No overlapping highlights to remove.", content, annotations_json
515
+ if _use_hf_backend():
516
+ return f"Removed {removed} highlight(s).", content, annotations_json
517
  try:
518
  path = _annotations_path(user_id, filename)
519
  size = os.path.getsize(path)
 
532
  if not user_id:
533
  return "Please enter your ID.", gr.update(), gr.update()
534
  try:
535
+ if _use_hf_backend():
536
+ _hf_delete_user_annotations(user_id)
537
+ else:
538
+ user_dir = _user_dir(user_id)
539
+ if user_dir and user_dir.exists():
540
+ for path in user_dir.glob("*.json"):
541
+ try:
542
+ path.unlink()
543
+ except Exception:
544
+ pass
545
+ legacy = Path(__file__).parent / f"annotations_{_normalize_user_id(user_id)}.json"
546
+ if legacy.exists():
547
  try:
548
+ legacy.unlink()
549
  except Exception:
550
  pass
 
 
 
 
 
 
551
  return "Cleared all annotations.", "<em>No file selected.</em>", "[]"
552
  except Exception as e:
553
  return f"Error clearing annotations: {e}", gr.update(), gr.update()
 
566
  safe = _normalize_user_id(new_id)
567
  if not safe:
568
  return gr.update(), "**Please enter a valid ID.**", gr.update()
569
+ if not _use_hf_backend():
570
+ ANNOTATIONS_ROOT.mkdir(parents=True, exist_ok=True)
571
+ user_dir = ANNOTATIONS_ROOT / safe
572
+ user_dir.mkdir(parents=True, exist_ok=True)
573
  choices = list_user_ids()
574
  if safe not in choices:
575
  choices.append(safe)
requirements.txt CHANGED
@@ -3,6 +3,7 @@ pandas
3
  numpy
4
  transformers
5
  datasets
 
6
  torch
7
  scikit-learn
8
- nltk
 
3
  numpy
4
  transformers
5
  datasets
6
+ huggingface_hub
7
  torch
8
  scikit-learn
9
+ nltk