Jin Zhu committed on
Commit
30f85d9
·
1 Parent(s): 0fa0487

Revert "update"

Browse files

This reverts commit 0fa048740e502ad73fe90368dd05ae943866bbdf.

Files changed (4) hide show
  1. .gitignore +1 -3
  2. src/FineTune/.gitignore +1 -0
  3. src/app.py +84 -182
  4. src/feedback.py +53 -55
.gitignore CHANGED
@@ -1,3 +1 @@
1
- cache/*
2
- src/feedback_data/*
3
- src/__pycache__/*
 
1
+ cache/*
 
 
src/FineTune/.gitignore CHANGED
@@ -8,6 +8,7 @@ __pycache__/
8
 
9
  # Distribution / packaging
10
  .Python
 
11
  logs/*/
12
  models/*/
13
  build/
 
8
 
9
  # Distribution / packaging
10
  .Python
11
+ ckpt/*
12
  logs/*/
13
  models/*/
14
  build/
src/app.py CHANGED
@@ -33,9 +33,6 @@ if os.environ.get('SPACE_ID'):
33
  import streamlit as st
34
  from FineTune.model import ComputeStat
35
  import time
36
- # 🆕 new imports
37
- import json
38
- import datetime
39
 
40
  # -----------------
41
  # Page Configuration
@@ -120,87 +117,9 @@ FEEDBACK_DATASET_ID = os.environ.get('FEEDBACK_DATASET_ID', 'mamba413/user-feedb
120
  feedback_manager = FeedbackManager(
121
  dataset_repo_id=FEEDBACK_DATASET_ID,
122
  hf_token=os.environ.get('HF_TOKEN'),
123
- local_backup=False if os.environ.get('SPACE_ID') else True # 保留本地备份
124
  )
125
 
126
- # 🆕 Incremental feedback saver for HF Spaces
127
- IS_SPACE = bool(os.environ.get('SPACE_ID'))
128
-
129
- @st.cache_resource
130
- def get_feedback_repo(dataset_repo_id: str, hf_token: str):
131
- if not IS_SPACE:
132
- return None
133
- try:
134
- from huggingface_hub import login, Repository
135
- if hf_token:
136
- login(token=hf_token)
137
- local_dir = Path('/tmp') / ('hf_ds_' + dataset_repo_id.replace('/', '__'))
138
- local_dir.mkdir(parents=True, exist_ok=True)
139
- repo = Repository(
140
- local_dir=str(local_dir),
141
- clone_from=dataset_repo_id,
142
- repo_type="dataset",
143
- token=hf_token,
144
- )
145
- return repo
146
- except Exception as e:
147
- print(f"[feedback repo] init failed: {e}")
148
- return None
149
-
150
- def save_feedback_incremental(text: str, domain: str, statistics: float, p_value: float, label: str):
151
- """
152
- Append a single feedback record to a date-sharded NDJSON file and push.
153
- Falls back to FeedbackManager on error or non-space environments.
154
- """
155
- try:
156
- repo = get_feedback_repo(FEEDBACK_DATASET_ID, os.environ.get('HF_TOKEN'))
157
- if repo is None:
158
- # Fallback (local or repo init failed)
159
- return feedback_manager.save_feedback(text, domain, statistics, p_value, label)
160
-
161
- # Pull latest, append, commit, push
162
- try:
163
- repo.git_pull(rebase=True)
164
- except Exception as e:
165
- print(f"[feedback repo] pull warning: {e}")
166
-
167
- now = datetime.datetime.utcnow()
168
- date_str = now.strftime("%Y-%m-%d")
169
- payload = {
170
- "timestamp": now.isoformat(timespec="seconds") + "Z",
171
- "space_id": os.environ.get('SPACE_ID'),
172
- "domain": domain,
173
- "label": label,
174
- "statistics": statistics,
175
- "p_value": p_value,
176
- "text": text,
177
- "app_version": "adadetectgpt-app-1", # optional tag
178
- }
179
-
180
- feedback_dir = Path(repo.local_dir) / "feedback"
181
- feedback_dir.mkdir(parents=True, exist_ok=True)
182
- file_path = feedback_dir / f"{date_str}.ndjson"
183
-
184
- with open(file_path, "a", encoding="utf-8") as f:
185
- f.write(json.dumps(payload, ensure_ascii=False) + "\n")
186
-
187
- # Commit and push only the changed file to minimize traffic
188
- repo.git_add(pattern=str(file_path))
189
- try:
190
- repo.git_commit(f"feedback: append {file_path.name}")
191
- except Exception as e:
192
- # allow empty commit errors to pass silently if nothing changed
193
- print(f"[feedback repo] commit info: {e}")
194
- repo.git_push()
195
- return True, f"Pushed to {FEEDBACK_DATASET_ID}:{file_path.name}"
196
- except Exception as e:
197
- # Final fallback if anything goes wrong
198
- print(f"[feedback repo] incremental save failed: {e}")
199
- try:
200
- return feedback_manager.save_feedback(text, domain, statistics, p_value, label)
201
- except Exception as e2:
202
- return False, f"Fallback failed: {e2}"
203
-
204
  # -----------------
205
  # Configuration
206
  # -----------------
@@ -209,7 +128,8 @@ MODEL_CONFIG = {
209
  'from_pretrained': './src/FineTune/ckpt/',
210
  'base_model': 'gemma-1b',
211
  'cache_dir': '../cache',
212
- 'device': 'cpu' if os.environ.get('SPACE_ID') else 'mps',
 
213
  # 'device': 'cuda',
214
  }
215
 
@@ -278,8 +198,7 @@ with col1:
278
  height=200,
279
  )
280
 
281
- # Add a stable key to the Detect button
282
- detect_clicked = st.button("Detect", type="primary", use_container_width=True, key="detect_btn")
283
 
284
  selected_domain = st.selectbox(
285
  label="⚙️ Domain (Optional)",
@@ -312,7 +231,9 @@ if detect_clicked:
312
  if not text_input.strip():
313
  st.warning("⚠️ Please enter some text before detecting.")
314
  else:
 
315
  st.session_state.feedback_given = False
 
316
 
317
  # Start timing to decide whether to show progress bar
318
  start_time = time.time()
@@ -347,8 +268,84 @@ if detect_clicked:
347
  'elapsed_time': elapsed_time
348
  }
349
 
350
- # NOTE: Do not render results and feedback here; they are rendered below
351
- # based on st.session_state.last_detection so buttons persist across reruns.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
  # Show detailed results
354
  with result_placeholder:
@@ -359,101 +356,6 @@ if detect_clicked:
359
  st.error(f"❌ Error during detection: {str(e)}")
360
  st.exception(e)
361
 
362
- # ================= Result & Feedback rendering (persistent across reruns) =================
363
- if st.session_state.last_detection:
364
- data = st.session_state.last_detection
365
-
366
- with col2:
367
- # Update score displays
368
- statistics_ph.text_input(
369
- label="Statistics",
370
- value=f"{data['statistics']:.6f}",
371
- disabled=True,
372
- help="Detection statistics will appear here after clicking Detect.",
373
- )
374
- pvalue_ph.text_input(
375
- label="p-value",
376
- value=f"{data['p_value']:.6f}",
377
- disabled=True,
378
- help="p-value will appear here after clicking Detect.",
379
- )
380
-
381
- st.info(
382
- """
383
- **📊 p-value:**
384
- - **Lower p-value** (closer to 0) indicates text is **more likely AI-generated**
385
- - **Higher p-value** (closer to 1) indicates text is **more likely human-written**
386
- - Generally, p-value < 0.05 suggests the text may be LLM-generated
387
- """,
388
- icon="💡"
389
- )
390
-
391
- st.markdown("**📝 Result Feedback**: Does this detection result meet your expectations?")
392
-
393
- current_text = data['text']
394
- current_domain = data['domain']
395
- current_statistics = data['statistics']
396
- current_pvalue = data['p_value']
397
-
398
- feedback_col1, feedback_col2 = st.columns(2)
399
-
400
- with feedback_col1:
401
- # Add a stable, unique key so click state is captured on rerun
402
- expected_click = st.button(
403
- "✅ Expected",
404
- use_container_width=True,
405
- type="secondary",
406
- key=f"expected_btn_{hash(current_text[:50])}"
407
- )
408
- print("--------------------------------------------------")
409
- print(f"Expected button clicked: {expected_click}")
410
- if expected_click and not st.session_state.feedback_given:
411
- try:
412
- # 🆕 use incremental saver (auto-fallbacks when needed)
413
- success, message = save_feedback_incremental(
414
- current_text,
415
- current_domain,
416
- current_statistics,
417
- current_pvalue,
418
- 'expected'
419
- )
420
- if success:
421
- st.success("✅ Thanks for your positive feedback!")
422
- st.session_state.feedback_given = True
423
- else:
424
- st.error(f"Failed to save feedback: {message}")
425
- except Exception as e:
426
- st.error(f"Failed to save feedback: {str(e)}")
427
- import traceback
428
- st.code(traceback.format_exc())
429
-
430
- with feedback_col2:
431
- unexpected_click = st.button(
432
- "❌ Unexpected",
433
- use_container_width=True,
434
- type="secondary",
435
- key=f"unexpected_btn_{hash(current_text[:50])}"
436
- )
437
- if unexpected_click and not st.session_state.feedback_given:
438
- try:
439
- # 🆕 use incremental saver (auto-fallbacks when needed)
440
- success, message = save_feedback_incremental(
441
- current_text,
442
- current_domain,
443
- current_statistics,
444
- current_pvalue,
445
- 'unexpected'
446
- )
447
- if success:
448
- st.warning("Feedback recorded! This will help us improve.")
449
- st.session_state.feedback_given = True
450
- else:
451
- st.error(f"Failed to save feedback: {message}")
452
- except Exception as e:
453
- st.error(f"Failed to save feedback: {str(e)}")
454
- import traceback
455
- st.code(traceback.format_exc())
456
-
457
  # ========== 🆕 Citation and paper reference section ==========
458
  # st.markdown("---")
459
  # st.markdown(
 
33
  import streamlit as st
34
  from FineTune.model import ComputeStat
35
  import time
 
 
 
36
 
37
  # -----------------
38
  # Page Configuration
 
117
  feedback_manager = FeedbackManager(
118
  dataset_repo_id=FEEDBACK_DATASET_ID,
119
  hf_token=os.environ.get('HF_TOKEN'),
120
+ local_backup=True # 保留本地备份
121
  )
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  # -----------------
124
  # Configuration
125
  # -----------------
 
128
  'from_pretrained': './src/FineTune/ckpt/',
129
  'base_model': 'gemma-1b',
130
  'cache_dir': '../cache',
131
+ # 'device': 'mps',
132
+ 'device': 'cpu',
133
  # 'device': 'cuda',
134
  }
135
 
 
198
  height=200,
199
  )
200
 
201
+ detect_clicked = st.button("Detect", type="primary", use_container_width=True)
 
202
 
203
  selected_domain = st.selectbox(
204
  label="⚙️ Domain (Optional)",
 
231
  if not text_input.strip():
232
  st.warning("⚠️ Please enter some text before detecting.")
233
  else:
234
+ # ========== Reset feedback state ==========
235
  st.session_state.feedback_given = False
236
+ # ==========================================
237
 
238
  # Start timing to decide whether to show progress bar
239
  start_time = time.time()
 
268
  'elapsed_time': elapsed_time
269
  }
270
 
271
+ # Update score displays
272
+ with col2:
273
+ statistics_ph.text_input(
274
+ label="Statistics",
275
+ value=f"{crit:.6f}",
276
+ disabled=True,
277
+ help="Detection statistics will appear here after clicking Detect.",
278
+ )
279
+
280
+ pvalue_ph.text_input(
281
+ label="p-value",
282
+ value=f"{p_value:.6f}",
283
+ disabled=True,
284
+ help="p-value will appear here after clicking Detect.",
285
+ )
286
+
287
+ st.info(
288
+ """
289
+ **📊 p-value:**
290
+ - **Lower p-value** (closer to 0) indicates text is **more likely AI-generated**
291
+ - **Higher p-value** (closer to 1) indicates text is **more likely human-written**
292
+ - Generally, p-value < 0.05 suggests the text may be LLM-generated
293
+ """,
294
+ icon="💡"
295
+ )
296
+
297
+ # ========== 🆕 Feedback buttons (moved here for better UX) ==========
298
+ st.markdown("**📝 Result Feedback**: Does this detection result meet your expectations?")
299
+
300
+ current_text = text_input
301
+ current_domain = selected_domain
302
+ current_statistics = crit
303
+ current_pvalue = p_value
304
+ feedback_col1, feedback_col2 = st.columns(2)
305
+
306
+ with feedback_col1:
307
+ if st.button("✅ Expected", use_container_width=True, type="secondary", key=f"expected_btn_{hash(text_input[:50])}"):
308
+ try:
309
+ success, message = feedback_manager.save_feedback(
310
+ current_text,
311
+ current_domain,
312
+ current_statistics,
313
+ current_pvalue,
314
+ 'expected'
315
+ )
316
+ if success:
317
+ st.success("✅ Thank you for your feedback!")
318
+ st.caption(f"💾 {message}")
319
+ else:
320
+ st.error(f"Failed to save feedback: {message}")
321
+ except Exception as e:
322
+ st.error(f"Failed to save feedback: {str(e)}")
323
+ import traceback
324
+ st.code(traceback.format_exc())
325
+
326
+ with feedback_col2:
327
+ if st.button("❌ Unexpected", use_container_width=True, type="secondary", key=f"unexpected_btn_{hash(text_input[:50])}"):
328
+ try:
329
+ success, message = feedback_manager.save_feedback(
330
+ current_text,
331
+ current_domain,
332
+ current_statistics,
333
+ current_pvalue,
334
+ 'unexpected'
335
+ )
336
+ if success:
337
+ st.warning("❌ Feedback recorded! This will help us improve.")
338
+ st.caption(f"💾 {message}")
339
+ else:
340
+ st.error(f"Failed to save feedback: {message}")
341
+ except Exception as e:
342
+ st.error(f"Failed to save feedback: {str(e)}")
343
+ import traceback
344
+ st.code(traceback.format_exc())
345
+
346
+ if st.session_state.feedback_given:
347
+ st.success("✅ Feedback submitted successfully!")
348
+ # ============================================
349
 
350
  # Show detailed results
351
  with result_placeholder:
 
356
  st.error(f"❌ Error during detection: {str(e)}")
357
  st.exception(e)
358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  # ========== 🆕 Citation and paper reference section ==========
360
  # st.markdown("---")
361
  # st.markdown(
src/feedback.py CHANGED
@@ -6,60 +6,6 @@ from huggingface_hub import HfApi, upload_file, hf_hub_download
6
  from typing import Optional
7
  import pandas as pd
8
 
9
- def save_feedback_incremental(text: str, domain: str, statistics: float, p_value: float, label: str):
10
- """
11
- Append a single feedback record to a date-sharded NDJSON file and push.
12
- Falls back to FeedbackManager on error or non-space environments.
13
- """
14
- try:
15
- repo = get_feedback_repo(FEEDBACK_DATASET_ID, os.environ.get('HF_TOKEN'))
16
- if repo is None:
17
- # Fallback (local or repo init failed)
18
- return feedback_manager.save_feedback(text, domain, statistics, p_value, label)
19
-
20
- # Pull latest, append, commit, push
21
- try:
22
- repo.git_pull(rebase=True)
23
- except Exception as e:
24
- print(f"[feedback repo] pull warning: {e}")
25
-
26
- now = datetime.datetime.utcnow()
27
- date_str = now.strftime("%Y-%m-%d")
28
- payload = {
29
- "timestamp": now.isoformat(timespec="seconds") + "Z",
30
- "space_id": os.environ.get('SPACE_ID'),
31
- "domain": domain,
32
- "label": label,
33
- "statistics": statistics,
34
- "p_value": p_value,
35
- "text": text,
36
- "app_version": "adadetectgpt-app-1", # optional tag
37
- }
38
-
39
- feedback_dir = Path(repo.local_dir) / "feedback"
40
- feedback_dir.mkdir(parents=True, exist_ok=True)
41
- file_path = feedback_dir / f"{date_str}.ndjson"
42
-
43
- with open(file_path, "a", encoding="utf-8") as f:
44
- f.write(json.dumps(payload, ensure_ascii=False) + "\n")
45
-
46
- # Commit and push only the changed file to minimize traffic
47
- repo.git_add(pattern=str(file_path))
48
- try:
49
- repo.git_commit(f"feedback: append {file_path.name}")
50
- except Exception as e:
51
- # allow empty commit errors to pass silently if nothing changed
52
- print(f"[feedback repo] commit info: {e}")
53
- repo.git_push()
54
- return True, f"Pushed to {FEEDBACK_DATASET_ID}:{file_path.name}"
55
- except Exception as e:
56
- # Final fallback if anything goes wrong
57
- print(f"[feedback repo] incremental save failed: {e}")
58
- try:
59
- return feedback_manager.save_feedback(text, domain, statistics, p_value, label)
60
- except Exception as e2:
61
- return False, f"Fallback failed: {e2}"
62
-
63
  class FeedbackManager:
64
  """管理用户反馈,支持保存到 Hugging Face 私有数据集"""
65
 
@@ -85,6 +31,7 @@ class FeedbackManager:
85
  if self.dataset_repo_id and self.hf_token:
86
  self.api = HfApi(token=self.hf_token)
87
  # 确保数据集存在
 
88
  else:
89
  self.api = None
90
  print("⚠️ No HF dataset configured. Will only save locally.")
@@ -97,7 +44,58 @@ class FeedbackManager:
97
 
98
  self.local_dir.mkdir(exist_ok=True, parents=True)
99
  self.local_file = self.local_dir / 'user_feedback.json'
100
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def _load_existing_data(self) -> list:
102
  """从 HF 数据集加载现有数据"""
103
  existing_data = []
 
6
  from typing import Optional
7
  import pandas as pd
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  class FeedbackManager:
10
  """管理用户反馈,支持保存到 Hugging Face 私有数据集"""
11
 
 
31
  if self.dataset_repo_id and self.hf_token:
32
  self.api = HfApi(token=self.hf_token)
33
  # 确保数据集存在
34
+ self._ensure_dataset_exists()
35
  else:
36
  self.api = None
37
  print("⚠️ No HF dataset configured. Will only save locally.")
 
44
 
45
  self.local_dir.mkdir(exist_ok=True, parents=True)
46
  self.local_file = self.local_dir / 'user_feedback.json'
47
+
48
+ def _ensure_dataset_exists(self):
49
+ """确保 HF 数据集存在,如果不存在则创建"""
50
+ try:
51
+ from huggingface_hub import create_repo
52
+ # 尝试创建数据集仓库(如果已存在会抛出异常)
53
+ try:
54
+ create_repo(
55
+ repo_id=self.dataset_repo_id,
56
+ token=self.hf_token,
57
+ private=True,
58
+ repo_type="dataset"
59
+ )
60
+ print(f"✅ Created new private dataset: {self.dataset_repo_id}")
61
+
62
+ # 创建初始的 README.md
63
+ readme_content = f"""---
64
+ license: mit
65
+ ---
66
+
67
+ # AdaDetectGPT User Feedback Dataset
68
+
69
+ This dataset contains user feedback from the AdaDetectGPT detection system.
70
+
71
+ ## Data Format
72
+
73
+ Each entry contains:
74
+ - `timestamp`: When the feedback was submitted
75
+ - `text`: The text that was analyzed
76
+ - `domain`: The domain selected for analysis
77
+ - `statistics`: The computed statistics value
78
+ - `p_value`: The p-value from the detection
79
+ - `feedback`: User feedback (expected/unexpected)
80
+ """
81
+ readme_file = self.local_dir / 'README.md'
82
+ readme_file.write_text(readme_content)
83
+
84
+ upload_file(
85
+ path_or_fileobj=str(readme_file),
86
+ path_in_repo="README.md",
87
+ repo_id=self.dataset_repo_id,
88
+ repo_type="dataset",
89
+ token=self.hf_token
90
+ )
91
+
92
+ except Exception as e:
93
+ if "already exists" not in str(e):
94
+ print(f"⚠️ Dataset check: {e}")
95
+
96
+ except Exception as e:
97
+ print(f"⚠️ Could not verify dataset: {e}")
98
+
99
  def _load_existing_data(self) -> list:
100
  """从 HF 数据集加载现有数据"""
101
  existing_data = []