Jin Zhu committed on
Commit
0fa0487
·
1 Parent(s): 3ef9054
Files changed (4) hide show
  1. .gitignore +3 -1
  2. src/FineTune/.gitignore +0 -1
  3. src/app.py +182 -84
  4. src/feedback.py +55 -53
.gitignore CHANGED
@@ -1 +1,3 @@
1
- cache/*
 
 
 
1
+ cache/*
2
+ src/feedback_data/*
3
+ src/__pycache__/*
src/FineTune/.gitignore CHANGED
@@ -8,7 +8,6 @@ __pycache__/
8
 
9
  # Distribution / packaging
10
  .Python
11
- ckpt/*
12
  logs/*/
13
  models/*/
14
  build/
 
8
 
9
  # Distribution / packaging
10
  .Python
 
11
  logs/*/
12
  models/*/
13
  build/
src/app.py CHANGED
@@ -33,6 +33,9 @@ if os.environ.get('SPACE_ID'):
33
  import streamlit as st
34
  from FineTune.model import ComputeStat
35
  import time
 
 
 
36
 
37
  # -----------------
38
  # Page Configuration
@@ -117,9 +120,87 @@ FEEDBACK_DATASET_ID = os.environ.get('FEEDBACK_DATASET_ID', 'mamba413/user-feedb
117
  feedback_manager = FeedbackManager(
118
  dataset_repo_id=FEEDBACK_DATASET_ID,
119
  hf_token=os.environ.get('HF_TOKEN'),
120
- local_backup=True # 保留本地备份
121
  )
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  # -----------------
124
  # Configuration
125
  # -----------------
@@ -128,8 +209,7 @@ MODEL_CONFIG = {
128
  'from_pretrained': './src/FineTune/ckpt/',
129
  'base_model': 'gemma-1b',
130
  'cache_dir': '../cache',
131
- # 'device': 'mps',
132
- 'device': 'cpu',
133
  # 'device': 'cuda',
134
  }
135
 
@@ -198,7 +278,8 @@ with col1:
198
  height=200,
199
  )
200
 
201
- detect_clicked = st.button("Detect", type="primary", use_container_width=True)
 
202
 
203
  selected_domain = st.selectbox(
204
  label="⚙️ Domain (Optional)",
@@ -231,9 +312,7 @@ if detect_clicked:
231
  if not text_input.strip():
232
  st.warning("⚠️ Please enter some text before detecting.")
233
  else:
234
- # ========== Reset feedback state ==========
235
  st.session_state.feedback_given = False
236
- # ==========================================
237
 
238
  # Start timing to decide whether to show progress bar
239
  start_time = time.time()
@@ -268,84 +347,8 @@ if detect_clicked:
268
  'elapsed_time': elapsed_time
269
  }
270
 
271
- # Update score displays
272
- with col2:
273
- statistics_ph.text_input(
274
- label="Statistics",
275
- value=f"{crit:.6f}",
276
- disabled=True,
277
- help="Detection statistics will appear here after clicking Detect.",
278
- )
279
-
280
- pvalue_ph.text_input(
281
- label="p-value",
282
- value=f"{p_value:.6f}",
283
- disabled=True,
284
- help="p-value will appear here after clicking Detect.",
285
- )
286
-
287
- st.info(
288
- """
289
- **📊 p-value:**
290
- - **Lower p-value** (closer to 0) indicates text is **more likely AI-generated**
291
- - **Higher p-value** (closer to 1) indicates text is **more likely human-written**
292
- - Generally, p-value < 0.05 suggests the text may be LLM-generated
293
- """,
294
- icon="💡"
295
- )
296
-
297
- # ========== 🆕 Feedback buttons (moved here for better UX) ==========
298
- st.markdown("**📝 Result Feedback**: Does this detection result meet your expectations?")
299
-
300
- current_text = text_input
301
- current_domain = selected_domain
302
- current_statistics = crit
303
- current_pvalue = p_value
304
- feedback_col1, feedback_col2 = st.columns(2)
305
-
306
- with feedback_col1:
307
- if st.button("✅ Expected", use_container_width=True, type="secondary", key=f"expected_btn_{hash(text_input[:50])}"):
308
- try:
309
- success, message = feedback_manager.save_feedback(
310
- current_text,
311
- current_domain,
312
- current_statistics,
313
- current_pvalue,
314
- 'expected'
315
- )
316
- if success:
317
- st.success("✅ Thank you for your feedback!")
318
- st.caption(f"💾 {message}")
319
- else:
320
- st.error(f"Failed to save feedback: {message}")
321
- except Exception as e:
322
- st.error(f"Failed to save feedback: {str(e)}")
323
- import traceback
324
- st.code(traceback.format_exc())
325
-
326
- with feedback_col2:
327
- if st.button("❌ Unexpected", use_container_width=True, type="secondary", key=f"unexpected_btn_{hash(text_input[:50])}"):
328
- try:
329
- success, message = feedback_manager.save_feedback(
330
- current_text,
331
- current_domain,
332
- current_statistics,
333
- current_pvalue,
334
- 'unexpected'
335
- )
336
- if success:
337
- st.warning("❌ Feedback recorded! This will help us improve.")
338
- st.caption(f"💾 {message}")
339
- else:
340
- st.error(f"Failed to save feedback: {message}")
341
- except Exception as e:
342
- st.error(f"Failed to save feedback: {str(e)}")
343
- import traceback
344
- st.code(traceback.format_exc())
345
-
346
- if st.session_state.feedback_given:
347
- st.success("✅ Feedback submitted successfully!")
348
- # ============================================
349
 
350
  # Show detailed results
351
  with result_placeholder:
@@ -356,6 +359,101 @@ if detect_clicked:
356
  st.error(f"❌ Error during detection: {str(e)}")
357
  st.exception(e)
358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  # ========== 🆕 Citation and paper reference section ==========
360
  # st.markdown("---")
361
  # st.markdown(
 
33
  import streamlit as st
34
  from FineTune.model import ComputeStat
35
  import time
36
+ # 🆕 new imports
37
+ import json
38
+ import datetime
39
 
40
  # -----------------
41
  # Page Configuration
 
120
  feedback_manager = FeedbackManager(
121
  dataset_repo_id=FEEDBACK_DATASET_ID,
122
  hf_token=os.environ.get('HF_TOKEN'),
123
+ local_backup=False if os.environ.get('SPACE_ID') else True # 保留本地备份
124
  )
125
 
126
+ # 🆕 Incremental feedback saver for HF Spaces
127
+ IS_SPACE = bool(os.environ.get('SPACE_ID'))
128
+
129
@st.cache_resource
def get_feedback_repo(dataset_repo_id: str, hf_token: str):
    """Return a local git clone of the feedback dataset, or ``None``.

    When ``IS_SPACE`` is false nothing is cloned and ``None`` is returned.
    Otherwise the dataset repo is cloned into
    ``/tmp/hf_ds_<repo id with '/' replaced by '__'>`` and the resulting
    ``huggingface_hub.Repository`` object is returned. Any exception raised
    while importing, logging in, or cloning is printed and turned into
    ``None`` so callers can fall back to another storage path.

    Args:
        dataset_repo_id: Hub id of the dataset repo to clone.
        hf_token: Access token; when falsy, the explicit ``login`` is skipped.

    Returns:
        The ``Repository`` clone, or ``None`` if unavailable.

    NOTE(review): the function is wrapped in ``st.cache_resource``, so a
    ``None`` result is presumably cached too and setup is not retried for
    the same arguments; confirm this is intended.
    """
    if IS_SPACE:
        try:
            # Imported lazily so the dependency is only touched inside a Space.
            from huggingface_hub import login, Repository

            if hf_token:
                login(token=hf_token)

            clone_path = Path('/tmp') / ('hf_ds_' + dataset_repo_id.replace('/', '__'))
            clone_path.mkdir(parents=True, exist_ok=True)
            return Repository(
                local_dir=str(clone_path),
                clone_from=dataset_repo_id,
                repo_type="dataset",
                token=hf_token,
            )
        except Exception as err:
            print(f"[feedback repo] init failed: {err}")
    return None
149
+
150
def save_feedback_incremental(text: str, domain: str, statistics: float, p_value: float, label: str):
    """Append one feedback record to a date-sharded NDJSON file and push it.

    The record is appended to ``feedback/<YYYY-MM-DD>.ndjson`` (UTC date)
    inside the clone returned by ``get_feedback_repo``, then committed and
    pushed. If no clone is available, or any step fails, the record is handed
    to ``feedback_manager.save_feedback`` instead.

    Args:
        text: The analysed text.
        domain: The domain selected for the analysis.
        statistics: Detection statistic; coerced to ``float`` for JSON.
        p_value: Detection p-value; coerced to ``float`` for JSON.
        label: Feedback label (the app passes ``'expected'`` / ``'unexpected'``).

    Returns:
        A ``(success, message)`` tuple. On the fallback path this is whatever
        ``feedback_manager.save_feedback`` returns; if the fallback itself
        raises, ``(False, "Fallback failed: ...")``.
    """

    def _as_json_number(value):
        # json.dumps rejects NumPy/torch scalar types; plain floats are unchanged.
        return None if value is None else float(value)

    try:
        repo = get_feedback_repo(FEEDBACK_DATASET_ID, os.environ.get('HF_TOKEN'))
        if repo is None:
            # Fallback (running locally, or repo initialisation failed).
            return feedback_manager.save_feedback(text, domain, statistics, p_value, label)

        # Best effort: a failed pull is only logged, the push below will
        # surface a real divergence.
        try:
            repo.git_pull(rebase=True)
        except Exception as e:
            print(f"[feedback repo] pull warning: {e}")

        # Timezone-aware UTC; datetime.utcnow() is deprecated and returns a
        # naive value. The serialised format ("...Z") is unchanged.
        now = datetime.datetime.now(datetime.timezone.utc)
        date_str = now.strftime("%Y-%m-%d")
        payload = {
            "timestamp": now.strftime("%Y-%m-%dT%H:%M:%SZ"),
            "space_id": os.environ.get('SPACE_ID'),
            "domain": domain,
            "label": label,
            "statistics": _as_json_number(statistics),
            "p_value": _as_json_number(p_value),
            "text": text,
            "app_version": "adadetectgpt-app-1",  # optional tag
        }
        # Serialise before touching the file so a bad payload never leaves a
        # partial line behind.
        line = json.dumps(payload, ensure_ascii=False) + "\n"

        feedback_dir = Path(repo.local_dir) / "feedback"
        feedback_dir.mkdir(parents=True, exist_ok=True)
        file_path = feedback_dir / f"{date_str}.ndjson"

        with open(file_path, "a", encoding="utf-8") as f:
            f.write(line)

        # Commit and push only the changed file to minimise traffic. A line
        # was just appended, so a commit failure is a real error: let it
        # reach the fallback below instead of reporting a push that never
        # contained this record.
        repo.git_add(pattern=str(file_path))
        repo.git_commit(f"feedback: append {file_path.name}")
        repo.git_push()
        return True, f"Pushed to {FEEDBACK_DATASET_ID}:{file_path.name}"
    except Exception as e:
        # Final fallback if anything goes wrong on the git path.
        print(f"[feedback repo] incremental save failed: {e}")
        try:
            return feedback_manager.save_feedback(text, domain, statistics, p_value, label)
        except Exception as e2:
            return False, f"Fallback failed: {e2}"
203
+
204
  # -----------------
205
  # Configuration
206
  # -----------------
 
209
  'from_pretrained': './src/FineTune/ckpt/',
210
  'base_model': 'gemma-1b',
211
  'cache_dir': '../cache',
212
+ 'device': 'cpu' if os.environ.get('SPACE_ID') else 'mps',
 
213
  # 'device': 'cuda',
214
  }
215
 
 
278
  height=200,
279
  )
280
 
281
+ # Add a stable key to the Detect button
282
+ detect_clicked = st.button("Detect", type="primary", use_container_width=True, key="detect_btn")
283
 
284
  selected_domain = st.selectbox(
285
  label="⚙️ Domain (Optional)",
 
312
  if not text_input.strip():
313
  st.warning("⚠️ Please enter some text before detecting.")
314
  else:
 
315
  st.session_state.feedback_given = False
 
316
 
317
  # Start timing to decide whether to show progress bar
318
  start_time = time.time()
 
347
  'elapsed_time': elapsed_time
348
  }
349
 
350
+ # NOTE: Do not render results and feedback here; they are rendered below
351
+ # based on st.session_state.last_detection so buttons persist across reruns.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
  # Show detailed results
354
  with result_placeholder:
 
359
  st.error(f"❌ Error during detection: {str(e)}")
360
  st.exception(e)
361
 
362
+ # ================= Result & Feedback rendering (persistent across reruns) =================
363
+ if st.session_state.last_detection:
364
+ data = st.session_state.last_detection
365
+
366
+ with col2:
367
+ # Update score displays
368
+ statistics_ph.text_input(
369
+ label="Statistics",
370
+ value=f"{data['statistics']:.6f}",
371
+ disabled=True,
372
+ help="Detection statistics will appear here after clicking Detect.",
373
+ )
374
+ pvalue_ph.text_input(
375
+ label="p-value",
376
+ value=f"{data['p_value']:.6f}",
377
+ disabled=True,
378
+ help="p-value will appear here after clicking Detect.",
379
+ )
380
+
381
+ st.info(
382
+ """
383
+ **📊 p-value:**
384
+ - **Lower p-value** (closer to 0) indicates text is **more likely AI-generated**
385
+ - **Higher p-value** (closer to 1) indicates text is **more likely human-written**
386
+ - Generally, p-value < 0.05 suggests the text may be LLM-generated
387
+ """,
388
+ icon="💡"
389
+ )
390
+
391
+ st.markdown("**📝 Result Feedback**: Does this detection result meet your expectations?")
392
+
393
+ current_text = data['text']
394
+ current_domain = data['domain']
395
+ current_statistics = data['statistics']
396
+ current_pvalue = data['p_value']
397
+
398
+ feedback_col1, feedback_col2 = st.columns(2)
399
+
400
+ with feedback_col1:
401
+ # Add a stable, unique key so click state is captured on rerun
402
+ expected_click = st.button(
403
+ "✅ Expected",
404
+ use_container_width=True,
405
+ type="secondary",
406
+ key=f"expected_btn_{hash(current_text[:50])}"
407
+ )
408
+ print("--------------------------------------------------")
409
+ print(f"Expected button clicked: {expected_click}")
410
+ if expected_click and not st.session_state.feedback_given:
411
+ try:
412
+ # 🆕 use incremental saver (auto-fallbacks when needed)
413
+ success, message = save_feedback_incremental(
414
+ current_text,
415
+ current_domain,
416
+ current_statistics,
417
+ current_pvalue,
418
+ 'expected'
419
+ )
420
+ if success:
421
+ st.success("✅ Thanks for your positive feedback!")
422
+ st.session_state.feedback_given = True
423
+ else:
424
+ st.error(f"Failed to save feedback: {message}")
425
+ except Exception as e:
426
+ st.error(f"Failed to save feedback: {str(e)}")
427
+ import traceback
428
+ st.code(traceback.format_exc())
429
+
430
+ with feedback_col2:
431
+ unexpected_click = st.button(
432
+ "❌ Unexpected",
433
+ use_container_width=True,
434
+ type="secondary",
435
+ key=f"unexpected_btn_{hash(current_text[:50])}"
436
+ )
437
+ if unexpected_click and not st.session_state.feedback_given:
438
+ try:
439
+ # 🆕 use incremental saver (auto-fallbacks when needed)
440
+ success, message = save_feedback_incremental(
441
+ current_text,
442
+ current_domain,
443
+ current_statistics,
444
+ current_pvalue,
445
+ 'unexpected'
446
+ )
447
+ if success:
448
+ st.warning("Feedback recorded! This will help us improve.")
449
+ st.session_state.feedback_given = True
450
+ else:
451
+ st.error(f"Failed to save feedback: {message}")
452
+ except Exception as e:
453
+ st.error(f"Failed to save feedback: {str(e)}")
454
+ import traceback
455
+ st.code(traceback.format_exc())
456
+
457
  # ========== 🆕 Citation and paper reference section ==========
458
  # st.markdown("---")
459
  # st.markdown(
src/feedback.py CHANGED
@@ -6,6 +6,60 @@ from huggingface_hub import HfApi, upload_file, hf_hub_download
6
  from typing import Optional
7
  import pandas as pd
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  class FeedbackManager:
10
  """管理用户反馈,支持保存到 Hugging Face 私有数据集"""
11
 
@@ -31,7 +85,6 @@ class FeedbackManager:
31
  if self.dataset_repo_id and self.hf_token:
32
  self.api = HfApi(token=self.hf_token)
33
  # 确保数据集存在
34
- self._ensure_dataset_exists()
35
  else:
36
  self.api = None
37
  print("⚠️ No HF dataset configured. Will only save locally.")
@@ -44,58 +97,7 @@ class FeedbackManager:
44
 
45
  self.local_dir.mkdir(exist_ok=True, parents=True)
46
  self.local_file = self.local_dir / 'user_feedback.json'
47
-
48
- def _ensure_dataset_exists(self):
49
- """确保 HF 数据集存在,如果不存在则创建"""
50
- try:
51
- from huggingface_hub import create_repo
52
- # 尝试创建数据集仓库(如果已存在会抛出异常)
53
- try:
54
- create_repo(
55
- repo_id=self.dataset_repo_id,
56
- token=self.hf_token,
57
- private=True,
58
- repo_type="dataset"
59
- )
60
- print(f"✅ Created new private dataset: {self.dataset_repo_id}")
61
-
62
- # 创建初始的 README.md
63
- readme_content = f"""---
64
- license: mit
65
- ---
66
-
67
- # AdaDetectGPT User Feedback Dataset
68
-
69
- This dataset contains user feedback from the AdaDetectGPT detection system.
70
-
71
- ## Data Format
72
-
73
- Each entry contains:
74
- - `timestamp`: When the feedback was submitted
75
- - `text`: The text that was analyzed
76
- - `domain`: The domain selected for analysis
77
- - `statistics`: The computed statistics value
78
- - `p_value`: The p-value from the detection
79
- - `feedback`: User feedback (expected/unexpected)
80
- """
81
- readme_file = self.local_dir / 'README.md'
82
- readme_file.write_text(readme_content)
83
-
84
- upload_file(
85
- path_or_fileobj=str(readme_file),
86
- path_in_repo="README.md",
87
- repo_id=self.dataset_repo_id,
88
- repo_type="dataset",
89
- token=self.hf_token
90
- )
91
-
92
- except Exception as e:
93
- if "already exists" not in str(e):
94
- print(f"⚠️ Dataset check: {e}")
95
-
96
- except Exception as e:
97
- print(f"⚠️ Could not verify dataset: {e}")
98
-
99
  def _load_existing_data(self) -> list:
100
  """从 HF 数据集加载现有数据"""
101
  existing_data = []
 
6
  from typing import Optional
7
  import pandas as pd
8
 
9
+ def save_feedback_incremental(text: str, domain: str, statistics: float, p_value: float, label: str):
10
+ """
11
+ Append a single feedback record to a date-sharded NDJSON file and push.
12
+ Falls back to FeedbackManager on error or non-space environments.
13
+ """
14
+ try:
15
+ repo = get_feedback_repo(FEEDBACK_DATASET_ID, os.environ.get('HF_TOKEN'))
16
+ if repo is None:
17
+ # Fallback (local or repo init failed)
18
+ return feedback_manager.save_feedback(text, domain, statistics, p_value, label)
19
+
20
+ # Pull latest, append, commit, push
21
+ try:
22
+ repo.git_pull(rebase=True)
23
+ except Exception as e:
24
+ print(f"[feedback repo] pull warning: {e}")
25
+
26
+ now = datetime.datetime.utcnow()
27
+ date_str = now.strftime("%Y-%m-%d")
28
+ payload = {
29
+ "timestamp": now.isoformat(timespec="seconds") + "Z",
30
+ "space_id": os.environ.get('SPACE_ID'),
31
+ "domain": domain,
32
+ "label": label,
33
+ "statistics": statistics,
34
+ "p_value": p_value,
35
+ "text": text,
36
+ "app_version": "adadetectgpt-app-1", # optional tag
37
+ }
38
+
39
+ feedback_dir = Path(repo.local_dir) / "feedback"
40
+ feedback_dir.mkdir(parents=True, exist_ok=True)
41
+ file_path = feedback_dir / f"{date_str}.ndjson"
42
+
43
+ with open(file_path, "a", encoding="utf-8") as f:
44
+ f.write(json.dumps(payload, ensure_ascii=False) + "\n")
45
+
46
+ # Commit and push only the changed file to minimize traffic
47
+ repo.git_add(pattern=str(file_path))
48
+ try:
49
+ repo.git_commit(f"feedback: append {file_path.name}")
50
+ except Exception as e:
51
+ # allow empty commit errors to pass silently if nothing changed
52
+ print(f"[feedback repo] commit info: {e}")
53
+ repo.git_push()
54
+ return True, f"Pushed to {FEEDBACK_DATASET_ID}:{file_path.name}"
55
+ except Exception as e:
56
+ # Final fallback if anything goes wrong
57
+ print(f"[feedback repo] incremental save failed: {e}")
58
+ try:
59
+ return feedback_manager.save_feedback(text, domain, statistics, p_value, label)
60
+ except Exception as e2:
61
+ return False, f"Fallback failed: {e2}"
62
+
63
  class FeedbackManager:
64
  """管理用户反馈,支持保存到 Hugging Face 私有数据集"""
65
 
 
85
  if self.dataset_repo_id and self.hf_token:
86
  self.api = HfApi(token=self.hf_token)
87
  # 确保数据集存在
 
88
  else:
89
  self.api = None
90
  print("⚠️ No HF dataset configured. Will only save locally.")
 
97
 
98
  self.local_dir.mkdir(exist_ok=True, parents=True)
99
  self.local_file = self.local_dir / 'user_feedback.json'
100
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def _load_existing_data(self) -> list:
102
  """从 HF 数据集加载现有数据"""
103
  existing_data = []