Jin Zhu committed on
Commit
3ef9054
·
1 Parent(s): b94233a

update data saving

Browse files
Files changed (2) hide show
  1. src/app.py +24 -52
  2. src/feedback.py +272 -0
src/app.py CHANGED
@@ -105,54 +105,20 @@ def load_model(from_pretrained, base_model, cache_dir, device):
105
  model.set_criterion_fn('mean')
106
  return model
107
 
108
- import json
109
- from datetime import datetime
110
-
111
  # -----------------
112
- # Result Feedback
113
  # -----------------
114
def save_feedback(text, domain, statistics, p_value, feedback_type):
    """Append one user-feedback record to the JSON feedback log.

    Args:
        text: The text that was analyzed.
        domain: The domain selected for the analysis.
        statistics: The detection statistic; converted with ``float()``.
        p_value: The detection p-value; converted with ``float()``.
        feedback_type: ``'expected'`` or ``'unexpected'``.

    Returns:
        Path of the JSON file the record was written to.

    Raises:
        OSError: If the feedback directory or file cannot be written.
        TypeError, ValueError: If ``statistics`` or ``p_value`` cannot be
            converted to ``float``.
    """
    # Pick the storage location from the environment: /tmp when SPACE_ID is
    # set (presumably a Hugging Face Space -- confirm), otherwise next to the app.
    if os.environ.get('SPACE_ID'):
        feedback_dir = Path('/tmp/feedback_data')
    else:
        feedback_dir = APP_DIR / 'feedback_data'

    feedback_dir.mkdir(exist_ok=True, parents=True)
    feedback_file = feedback_dir / 'user_feedback.json'

    feedback_entry = {
        'timestamp': datetime.now().isoformat(),
        'text': text,
        'domain': domain,
        'statistics': float(statistics),
        'p_value': float(p_value),
        'feedback': feedback_type
    }

    # Load the existing history. An unreadable or malformed file starts a new
    # history instead of aborting, but only I/O and parse errors are swallowed
    # (a bare ``except:`` would also hide KeyboardInterrupt/SystemExit).
    feedback_data = []
    if feedback_file.exists():
        try:
            with open(feedback_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            loaded = None
        # Anything other than a list cannot be appended to; start over.
        if isinstance(loaded, list):
            feedback_data = loaded

    feedback_data.append(feedback_entry)

    with open(feedback_file, 'w', encoding='utf-8') as f:
        json.dump(feedback_data, f, ensure_ascii=False, indent=2)

    return feedback_file
156
 
157
  # -----------------
158
  # Configuration
@@ -340,15 +306,18 @@ if detect_clicked:
340
  with feedback_col1:
341
  if st.button("✅ Expected", use_container_width=True, type="secondary", key=f"expected_btn_{hash(text_input[:50])}"):
342
  try:
343
- feedback_file = save_feedback(
344
  current_text,
345
  current_domain,
346
  current_statistics,
347
  current_pvalue,
348
  'expected'
349
  )
350
- st.success("✅ Thank you for your feedback!")
351
- st.caption(f"💾 Saved to: `{feedback_file.name}`")
 
 
 
352
  except Exception as e:
353
  st.error(f"Failed to save feedback: {str(e)}")
354
  import traceback
@@ -357,15 +326,18 @@ if detect_clicked:
357
  with feedback_col2:
358
  if st.button("❌ Unexpected", use_container_width=True, type="secondary", key=f"unexpected_btn_{hash(text_input[:50])}"):
359
  try:
360
- feedback_file = save_feedback(
361
  current_text,
362
  current_domain,
363
  current_statistics,
364
  current_pvalue,
365
  'unexpected'
366
  )
367
- st.warning("❌ Feedback recorded! This will help us improve.")
368
- st.caption(f"💾 Saved to: `{feedback_file.name}`")
 
 
 
369
  except Exception as e:
370
  st.error(f"Failed to save feedback: {str(e)}")
371
  import traceback
 
105
  model.set_criterion_fn('mean')
106
  return model
107
 
 
 
 
108
  # -----------------
109
+ # Result Feedback Module Import
110
  # -----------------
111
+ from feedback import FeedbackManager
112
+
113
# Feedback storage backed by a Hugging Face dataset.
# The target repository ID is read from the FEEDBACK_DATASET_ID environment
# variable, falling back to the default below; replace it with your own
# dataset repo ID if needed.
# HF_TOKEN must be set in the environment to access a private dataset.
FEEDBACK_DATASET_ID = os.getenv('FEEDBACK_DATASET_ID', 'mamba413/user-feedback')
feedback_manager = FeedbackManager(
    local_backup=True,  # keep a local backup copy as well
    hf_token=os.getenv('HF_TOKEN'),
    dataset_repo_id=FEEDBACK_DATASET_ID,
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  # -----------------
124
  # Configuration
 
306
  with feedback_col1:
307
  if st.button("✅ Expected", use_container_width=True, type="secondary", key=f"expected_btn_{hash(text_input[:50])}"):
308
  try:
309
+ success, message = feedback_manager.save_feedback(
310
  current_text,
311
  current_domain,
312
  current_statistics,
313
  current_pvalue,
314
  'expected'
315
  )
316
+ if success:
317
+ st.success(" Thank you for your feedback!")
318
+ st.caption(f"💾 {message}")
319
+ else:
320
+ st.error(f"Failed to save feedback: {message}")
321
  except Exception as e:
322
  st.error(f"Failed to save feedback: {str(e)}")
323
  import traceback
 
326
  with feedback_col2:
327
  if st.button("❌ Unexpected", use_container_width=True, type="secondary", key=f"unexpected_btn_{hash(text_input[:50])}"):
328
  try:
329
+ success, message = feedback_manager.save_feedback(
330
  current_text,
331
  current_domain,
332
  current_statistics,
333
  current_pvalue,
334
  'unexpected'
335
  )
336
+ if success:
337
+ st.warning(" Feedback recorded! This will help us improve.")
338
+ st.caption(f"💾 {message}")
339
+ else:
340
+ st.error(f"Failed to save feedback: {message}")
341
  except Exception as e:
342
  st.error(f"Failed to save feedback: {str(e)}")
343
  import traceback
src/feedback.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from huggingface_hub import HfApi, upload_file, hf_hub_download
6
+ from typing import Optional
7
+ import pandas as pd
8
+
9
class FeedbackManager:
    """Store user feedback locally and, when configured, in a private Hugging Face dataset.

    Each call to :meth:`save_feedback` re-reads the full feedback history,
    appends one entry and rewrites the whole JSON file (plus a CSV mirror on
    the Hub when a dataset is configured).
    """

    def __init__(
        self,
        dataset_repo_id: Optional[str] = None,
        hf_token: Optional[str] = None,
        local_backup: bool = True
    ):
        """Initialize the manager.

        Args:
            dataset_repo_id: Hugging Face dataset repo ID
                (format: ``username/dataset-name``).
            hf_token: Hugging Face API token (needed for private datasets).
                Falls back to the ``HF_TOKEN`` environment variable.
            local_backup: Whether to also keep a local JSON copy.
        """
        self.dataset_repo_id = dataset_repo_id
        self.hf_token = hf_token or os.environ.get('HF_TOKEN')
        self.local_backup = local_backup

        # Resolve the local storage paths BEFORE touching the Hub:
        # _ensure_dataset_exists() writes the initial README into local_dir,
        # so the directory must already be known and present.
        # /tmp is used when SPACE_ID is set (presumably a Hugging Face
        # Space -- confirm).
        if os.environ.get('SPACE_ID'):
            self.local_dir = Path('/tmp/feedback_data')
        else:
            self.local_dir = Path(__file__).parent / 'feedback_data'

        self.local_dir.mkdir(exist_ok=True, parents=True)
        self.local_file = self.local_dir / 'user_feedback.json'

        # The Hub is only used when both a repo ID and a token are available.
        if self.dataset_repo_id and self.hf_token:
            self.api = HfApi(token=self.hf_token)
            self._ensure_dataset_exists()
        else:
            self.api = None
            print("⚠️ No HF dataset configured. Will only save locally.")

    def _ensure_dataset_exists(self):
        """Create the private dataset repo (with an initial README) if it is missing.

        Best effort: every failure is printed, never raised, so a Hub outage
        does not prevent the app from starting.
        """
        try:
            from huggingface_hub import create_repo

            try:
                # Creation is attempted unconditionally; an error whose text
                # contains "already exists" is treated as "nothing to do".
                create_repo(
                    repo_id=self.dataset_repo_id,
                    token=self.hf_token,
                    private=True,
                    repo_type="dataset"
                )
                print(f"✅ Created new private dataset: {self.dataset_repo_id}")

                # Seed the new repository with a README describing the data.
                readme_content = (
                    "---\n"
                    "license: mit\n"
                    "---\n"
                    "\n"
                    "# AdaDetectGPT User Feedback Dataset\n"
                    "\n"
                    "This dataset contains user feedback from the AdaDetectGPT detection system.\n"
                    "\n"
                    "## Data Format\n"
                    "\n"
                    "Each entry contains:\n"
                    "- `timestamp`: When the feedback was submitted\n"
                    "- `text`: The text that was analyzed\n"
                    "- `domain`: The domain selected for analysis\n"
                    "- `statistics`: The computed statistics value\n"
                    "- `p_value`: The p-value from the detection\n"
                    "- `feedback`: User feedback (expected/unexpected)\n"
                )
                readme_file = self.local_dir / 'README.md'
                readme_file.write_text(readme_content, encoding='utf-8')

                upload_file(
                    path_or_fileobj=str(readme_file),
                    path_in_repo="README.md",
                    repo_id=self.dataset_repo_id,
                    repo_type="dataset",
                    token=self.hf_token
                )

            except Exception as e:
                if "already exists" not in str(e):
                    print(f"⚠️ Dataset check: {e}")

        except Exception as e:
            print(f"⚠️ Could not verify dataset: {e}")

    @staticmethod
    def _read_entries(path) -> list:
        """Read a JSON file and return its content, which must be a list.

        Raises:
            OSError: If the file cannot be read.
            ValueError: If the file is not valid JSON or does not hold a list.
        """
        with open(path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
        if not isinstance(loaded, list):
            raise ValueError(
                f"expected a JSON list of feedback entries, got {type(loaded).__name__}"
            )
        return loaded

    def _load_existing_data(self) -> list:
        """Return the stored feedback entries, preferring the HF dataset over the local file.

        Returns an empty list when neither source yields data. Load errors
        are printed, not raised.

        NOTE(review): if the Hub download fails for a transient reason, the
        caller continues with the local (possibly shorter) history and the
        next upload overwrites the remote file with it -- consider guarding
        against that.
        """
        existing_data = []

        # First try the HF dataset.
        if self.api and self.dataset_repo_id:
            try:
                local_path = hf_hub_download(
                    repo_id=self.dataset_repo_id,
                    filename="feedback_data.json",
                    repo_type="dataset",
                    token=self.hf_token,
                    cache_dir=str(self.local_dir)
                )
                existing_data = self._read_entries(local_path)
                print(f"📥 Loaded {len(existing_data)} existing feedback entries from HF")
            except Exception as e:
                # A "404" simply means the file has not been uploaded yet.
                if "404" not in str(e):
                    print(f"⚠️ Could not load from HF dataset: {e}")

        # Fall back to the local file when the Hub produced nothing.
        if not existing_data and self.local_file.exists():
            try:
                existing_data = self._read_entries(self.local_file)
                print(f"📥 Loaded {len(existing_data)} existing feedback entries from local")
            except Exception as e:
                print(f"⚠️ Could not load local data: {e}")

        return existing_data

    def save_feedback(
        self,
        text: str,
        domain: str,
        statistics: float,
        p_value: float,
        feedback_type: str
    ) -> tuple[bool, str]:
        """Append one feedback entry and persist it locally and/or to the HF dataset.

        Args:
            text: The text that was analyzed.
            domain: The selected domain.
            statistics: The detection statistic; converted with ``float()``.
            p_value: The detection p-value; converted with ``float()``.
            feedback_type: ``'expected'`` or ``'unexpected'``.

        Returns:
            ``(success, message)``. ``success`` is True only if at least one
            backend actually stored the data; ``message`` joins the
            per-backend status messages with ``" | "``.
        """
        feedback_entry = {
            'timestamp': datetime.now().isoformat(),
            'text': text,
            'domain': domain,
            'statistics': float(statistics),
            'p_value': float(p_value),
            'feedback': feedback_type
        }

        feedback_data = self._load_existing_data()
        feedback_data.append(feedback_entry)

        # Serialize once so the local file and the Hub upload carry exactly
        # the same content, and the upload does not depend on the local file.
        try:
            payload = json.dumps(feedback_data, ensure_ascii=False, indent=2)
        except (TypeError, ValueError) as e:
            return False, f"❌ Could not serialize feedback: {e}"

        success = False
        messages = []

        # Local copy (acts as a backup).
        if self.local_backup:
            try:
                with open(self.local_file, 'w', encoding='utf-8') as f:
                    f.write(payload)
                messages.append("💾 Local backup saved")
                success = True
            except Exception as e:
                messages.append(f"❌ Local save failed: {e}")

        # Upload to the HF dataset.
        if self.api and self.dataset_repo_id:
            try:
                # Upload the in-memory bytes instead of the local file: with
                # local_backup disabled (or a failed local write) the file on
                # disk is missing or stale.
                upload_file(
                    path_or_fileobj=payload.encode('utf-8'),
                    path_in_repo="feedback_data.json",
                    repo_id=self.dataset_repo_id,
                    repo_type="dataset",
                    token=self.hf_token,
                    commit_message=f"Add feedback: {feedback_type} at {feedback_entry['timestamp']}"
                )

                # Also create/update a CSV mirror for easy browsing.
                df = pd.DataFrame(feedback_data)
                csv_file = self.local_dir / 'feedback_data.csv'
                df.to_csv(csv_file, index=False)

                upload_file(
                    path_or_fileobj=str(csv_file),
                    path_in_repo="feedback_data.csv",
                    repo_id=self.dataset_repo_id,
                    repo_type="dataset",
                    token=self.hf_token,
                    commit_message=f"Update CSV: {len(feedback_data)} total entries"
                )

                messages.append(f"☁️ Uploaded to HF dataset: {self.dataset_repo_id}")
                success = True

            except Exception as e:
                # Keep whatever the local save achieved: the call still
                # succeeds if the backup was written, but must not claim
                # success when the local write failed as well.
                messages.append(f"⚠️ HF upload failed: {e}")

        if not messages:
            messages.append(
                "⚠️ No storage backend configured (local backup disabled and no HF dataset)"
            )

        return success, " | ".join(messages)

    def get_feedback_stats(self) -> dict:
        """Return summary counts over all stored feedback entries.

        Returns:
            A dict with ``total_count``, ``expected_count``,
            ``unexpected_count`` and ``domains`` (domain -> number of
            entries). Counts are zero / empty when the corresponding field
            is absent from the stored entries.
        """
        feedback_data = self._load_existing_data()

        if not feedback_data:
            return {
                'total_count': 0,
                'expected_count': 0,
                'unexpected_count': 0,
                'domains': {}
            }

        df = pd.DataFrame(feedback_data)

        # Guard both columns: entries written by other tools may lack them.
        if 'feedback' in df.columns:
            verdicts = df['feedback']
            expected_count = int((verdicts == 'expected').sum())
            unexpected_count = int((verdicts == 'unexpected').sum())
        else:
            expected_count = 0
            unexpected_count = 0

        stats = {
            'total_count': len(df),
            'expected_count': expected_count,
            'unexpected_count': unexpected_count,
            'domains': df['domain'].value_counts().to_dict() if 'domain' in df.columns else {}
        }

        return stats
241
+
242
+
243
# Convenience functions (kept for backward compatibility).
_default_manager: Optional[FeedbackManager] = None


def init_feedback_manager(dataset_repo_id: str = None, hf_token: str = None):
    """Create a FeedbackManager, install it as the module-wide default and return it."""
    global _default_manager
    manager = FeedbackManager(dataset_repo_id=dataset_repo_id, hf_token=hf_token)
    _default_manager = manager
    return manager
254
+
255
def save_feedback(text: str, domain: str, statistics: float, p_value: float, feedback_type: str):
    """Save one feedback entry through the default manager (backward-compatible helper).

    The default manager is created on first use, with its dataset repo ID
    taken from the ``FEEDBACK_DATASET_ID`` environment variable.

    Returns:
        The status message reported by the manager.

    Raises:
        Exception: If the manager reports that the feedback was not saved.
    """
    global _default_manager
    manager = _default_manager
    if manager is None:
        # Lazily build the default manager from the environment configuration.
        manager = FeedbackManager(
            dataset_repo_id=os.environ.get('FEEDBACK_DATASET_ID')
        )
        _default_manager = manager

    saved, detail = manager.save_feedback(
        text, domain, statistics, p_value, feedback_type
    )

    if saved:
        return detail
    raise Exception(f"Failed to save feedback: {detail}")