Spaces:

stats-powered-ai
/

StatDetectLLM

Running

App Files Files Community

Jin Zhu committed on Oct 26, 2025

Commit

0fa0487

1 Parent(s): 3ef9054

update

Browse files

Files changed (4) hide show

.gitignore +3 -1
src/FineTune/.gitignore +0 -1
src/app.py +182 -84
src/feedback.py +55 -53

.gitignore CHANGED Viewed

	@@ -1 +1,3 @@
1	- cache/*

+cache/*
+src/feedback_data/*
+src/__pycache__/*

src/FineTune/.gitignore CHANGED Viewed

@@ -8,7 +8,6 @@ __pycache__/
 # Distribution / packaging
 .Python
-ckpt/*
 logs/*/
 models/*/
 build/

 # Distribution / packaging
 .Python
 logs/*/
 models/*/
 build/

src/app.py CHANGED Viewed

@@ -33,6 +33,9 @@ if os.environ.get('SPACE_ID'):
 import streamlit as st
 from FineTune.model import ComputeStat
 import time
 # -----------------
 # Page Configuration
@@ -117,9 +120,87 @@ FEEDBACK_DATASET_ID = os.environ.get('FEEDBACK_DATASET_ID', 'mamba413/user-feedb
 feedback_manager = FeedbackManager(
     dataset_repo_id=FEEDBACK_DATASET_ID,
     hf_token=os.environ.get('HF_TOKEN'),
-    local_backup=True  # 保留本地备份
 )
 # -----------------
 # Configuration
 # -----------------
@@ -128,8 +209,7 @@ MODEL_CONFIG = {
     'from_pretrained': './src/FineTune/ckpt/',
     'base_model': 'gemma-1b',
     'cache_dir': '../cache',
-    # 'device': 'mps',
-    'device': 'cpu',
     # 'device': 'cuda',
 }
@@ -198,7 +278,8 @@ with col1:
         height=200,
     )
-    detect_clicked = st.button("Detect", type="primary", use_container_width=True)
     selected_domain = st.selectbox(
         label="⚙️ Domain (Optional)",
@@ -231,9 +312,7 @@ if detect_clicked:
     if not text_input.strip():
         st.warning("⚠️ Please enter some text before detecting.")
     else:
-        # ========== Reset feedback state ==========
         st.session_state.feedback_given = False
-        # ==========================================
         # Start timing to decide whether to show progress bar
         start_time = time.time()
@@ -268,84 +347,8 @@ if detect_clicked:
                 'elapsed_time': elapsed_time
             }
-            # Update score displays
-            with col2:
-                statistics_ph.text_input(
-                    label="Statistics",
-                    value=f"{crit:.6f}",
-                    disabled=True,
-                    help="Detection statistics will appear here after clicking Detect.",
-                )
-                pvalue_ph.text_input(
-                    label="p-value",
-                    value=f"{p_value:.6f}",
-                    disabled=True,
-                    help="p-value will appear here after clicking Detect.",
-                )
-                st.info(
-                    """
-                    **📊 p-value:**
-                    - **Lower p-value** (closer to 0) indicates text is **more likely AI-generated**
-                    - **Higher p-value** (closer to 1) indicates text is **more likely human-written**
-                    - Generally, p-value < 0.05 suggests the text may be LLM-generated
-                    """,
-                    icon="💡"
-                )
-                # ========== 🆕 Feedback buttons (moved here for better UX) ==========
-                st.markdown("**📝 Result Feedback**: Does this detection result meet your expectations?")
-                current_text = text_input
-                current_domain = selected_domain
-                current_statistics = crit
-                current_pvalue = p_value
-                feedback_col1, feedback_col2 = st.columns(2)
-                with feedback_col1:
-                    if st.button("✅ Expected", use_container_width=True, type="secondary", key=f"expected_btn_{hash(text_input[:50])}"):
-                        try:
-                            success, message = feedback_manager.save_feedback(
-                                current_text,
-                                current_domain,
-                                current_statistics,
-                                current_pvalue,
-                                'expected'
-                            )
-                            if success:
-                                st.success("✅ Thank you for your feedback!")
-                                st.caption(f"💾 {message}")
-                            else:
-                                st.error(f"Failed to save feedback: {message}")
-                        except Exception as e:
-                            st.error(f"Failed to save feedback: {str(e)}")
-                            import traceback
-                            st.code(traceback.format_exc())
-                with feedback_col2:
-                    if st.button("❌ Unexpected", use_container_width=True, type="secondary", key=f"unexpected_btn_{hash(text_input[:50])}"):
-                        try:
-                            success, message = feedback_manager.save_feedback(
-                                current_text,
-                                current_domain,
-                                current_statistics,
-                                current_pvalue,
-                                'unexpected'
-                            )
-                            if success:
-                                st.warning("❌ Feedback recorded! This will help us improve.")
-                                st.caption(f"💾 {message}")
-                            else:
-                                st.error(f"Failed to save feedback: {message}")
-                        except Exception as e:
-                            st.error(f"Failed to save feedback: {str(e)}")
-                            import traceback
-                            st.code(traceback.format_exc())
-                if st.session_state.feedback_given:
-                    st.success("✅ Feedback submitted successfully!")
-                # ============================================
             # Show detailed results
             with result_placeholder:
@@ -356,6 +359,101 @@ if detect_clicked:
             st.error(f"❌ Error during detection: {str(e)}")
             st.exception(e)
 # ========== 🆕 Citation and paper reference section ==========
 # st.markdown("---")
 # st.markdown(

 import streamlit as st
 from FineTune.model import ComputeStat
 import time
+# 🆕 new imports
+import json
+import datetime
 # -----------------
 # Page Configuration
 feedback_manager = FeedbackManager(
     dataset_repo_id=FEEDBACK_DATASET_ID,
     hf_token=os.environ.get('HF_TOKEN'),
+    local_backup=False if os.environ.get('SPACE_ID') else True  # keep a local backup only when SPACE_ID is unset (i.e. not running on a HF Space)
 )
+# 🆕 Incremental feedback saver for HF Spaces
+IS_SPACE = bool(os.environ.get('SPACE_ID'))
+@st.cache_resource
+def get_feedback_repo(dataset_repo_id: str, hf_token: str):
+    if not IS_SPACE:
+        return None
+    try:
+        from huggingface_hub import login, Repository
+        if hf_token:
+            login(token=hf_token)
+        local_dir = Path('/tmp') / ('hf_ds_' + dataset_repo_id.replace('/', '__'))
+        local_dir.mkdir(parents=True, exist_ok=True)
+        repo = Repository(
+            local_dir=str(local_dir),
+            clone_from=dataset_repo_id,
+            repo_type="dataset",
+            token=hf_token,
+        )
+        return repo
+    except Exception as e:
+        print(f"[feedback repo] init failed: {e}")
+        return None
+def save_feedback_incremental(text: str, domain: str, statistics: float, p_value: float, label: str):
+    """
+    Append a single feedback record to a date-sharded NDJSON file and push.
+    Falls back to FeedbackManager on error or non-space environments.
+    """
+    try:
+        repo = get_feedback_repo(FEEDBACK_DATASET_ID, os.environ.get('HF_TOKEN'))
+        if repo is None:
+            # Fallback (local or repo init failed)
+            return feedback_manager.save_feedback(text, domain, statistics, p_value, label)
+        # Pull latest, append, commit, push
+        try:
+            repo.git_pull(rebase=True)
+        except Exception as e:
+            print(f"[feedback repo] pull warning: {e}")
+        now = datetime.datetime.utcnow()
+        date_str = now.strftime("%Y-%m-%d")
+        payload = {
+            "timestamp": now.isoformat(timespec="seconds") + "Z",
+            "space_id": os.environ.get('SPACE_ID'),
+            "domain": domain,
+            "label": label,
+            "statistics": statistics,
+            "p_value": p_value,
+            "text": text,
+            "app_version": "adadetectgpt-app-1",  # optional tag
+        }
+        feedback_dir = Path(repo.local_dir) / "feedback"
+        feedback_dir.mkdir(parents=True, exist_ok=True)
+        file_path = feedback_dir / f"{date_str}.ndjson"
+        with open(file_path, "a", encoding="utf-8") as f:
+            f.write(json.dumps(payload, ensure_ascii=False) + "\n")
+        # Commit and push only the changed file to minimize traffic
+        repo.git_add(pattern=str(file_path))
+        try:
+            repo.git_commit(f"feedback: append {file_path.name}")
+        except Exception as e:
+            # allow empty commit errors to pass silently if nothing changed
+            print(f"[feedback repo] commit info: {e}")
+        repo.git_push()
+        return True, f"Pushed to {FEEDBACK_DATASET_ID}:{file_path.name}"
+    except Exception as e:
+        # Final fallback if anything goes wrong
+        print(f"[feedback repo] incremental save failed: {e}")
+        try:
+            return feedback_manager.save_feedback(text, domain, statistics, p_value, label)
+        except Exception as e2:
+            return False, f"Fallback failed: {e2}"
 # -----------------
 # Configuration
 # -----------------
     'from_pretrained': './src/FineTune/ckpt/',
     'base_model': 'gemma-1b',
     'cache_dir': '../cache',
+    'device': 'cpu' if os.environ.get('SPACE_ID') else 'mps',
     # 'device': 'cuda',
 }
         height=200,
     )
+    # Add a stable key to the Detect button
+    detect_clicked = st.button("Detect", type="primary", use_container_width=True, key="detect_btn")
     selected_domain = st.selectbox(
         label="⚙️ Domain (Optional)",
     if not text_input.strip():
         st.warning("⚠️ Please enter some text before detecting.")
     else:
         st.session_state.feedback_given = False
         # Start timing to decide whether to show progress bar
         start_time = time.time()
                 'elapsed_time': elapsed_time
             }
+            # NOTE: Do not render results and feedback here; they are rendered below
+            # based on st.session_state.last_detection so buttons persist across reruns.
             # Show detailed results
             with result_placeholder:
             st.error(f"❌ Error during detection: {str(e)}")
             st.exception(e)
+# ================= Result & Feedback rendering (persistent across reruns) =================
+if st.session_state.last_detection:
+    data = st.session_state.last_detection
+    with col2:
+        # Update score displays
+        statistics_ph.text_input(
+            label="Statistics",
+            value=f"{data['statistics']:.6f}",
+            disabled=True,
+            help="Detection statistics will appear here after clicking Detect.",
+        )
+        pvalue_ph.text_input(
+            label="p-value",
+            value=f"{data['p_value']:.6f}",
+            disabled=True,
+            help="p-value will appear here after clicking Detect.",
+        )
+        st.info(
+            """
+            **📊 p-value:**
+            - **Lower p-value** (closer to 0) indicates text is **more likely AI-generated**
+            - **Higher p-value** (closer to 1) indicates text is **more likely human-written**
+            - Generally, p-value < 0.05 suggests the text may be LLM-generated
+            """,
+            icon="💡"
+        )
+        st.markdown("**📝 Result Feedback**: Does this detection result meet your expectations?")
+        current_text = data['text']
+        current_domain = data['domain']
+        current_statistics = data['statistics']
+        current_pvalue = data['p_value']
+        feedback_col1, feedback_col2 = st.columns(2)
+        with feedback_col1:
+            # Add a stable, unique key so click state is captured on rerun
+            expected_click = st.button(
+                "✅ Expected",
+                use_container_width=True,
+                type="secondary",
+                key=f"expected_btn_{hash(current_text[:50])}"
+            )
+            print("--------------------------------------------------")
+            print(f"Expected button clicked: {expected_click}")
+            if expected_click and not st.session_state.feedback_given:
+                try:
+                    # 🆕 use incremental saver (auto-fallbacks when needed)
+                    success, message = save_feedback_incremental(
+                        current_text,
+                        current_domain,
+                        current_statistics,
+                        current_pvalue,
+                        'expected'
+                    )
+                    if success:
+                        st.success("✅ Thanks for your positive feedback!")
+                        st.session_state.feedback_given = True
+                    else:
+                        st.error(f"Failed to save feedback: {message}")
+                except Exception as e:
+                    st.error(f"Failed to save feedback: {str(e)}")
+                    import traceback
+                    st.code(traceback.format_exc())
+        with feedback_col2:
+            unexpected_click = st.button(
+                "❌ Unexpected",
+                use_container_width=True,
+                type="secondary",
+                key=f"unexpected_btn_{hash(current_text[:50])}"
+            )
+            if unexpected_click and not st.session_state.feedback_given:
+                try:
+                    # 🆕 use incremental saver (auto-fallbacks when needed)
+                    success, message = save_feedback_incremental(
+                        current_text,
+                        current_domain,
+                        current_statistics,
+                        current_pvalue,
+                        'unexpected'
+                    )
+                    if success:
+                        st.warning("Feedback recorded! This will help us improve.")
+                        st.session_state.feedback_given = True
+                    else:
+                        st.error(f"Failed to save feedback: {message}")
+                except Exception as e:
+                    st.error(f"Failed to save feedback: {str(e)}")
+                    import traceback
+                    st.code(traceback.format_exc())
 # ========== 🆕 Citation and paper reference section ==========
 # st.markdown("---")
 # st.markdown(

src/feedback.py CHANGED Viewed

@@ -6,6 +6,60 @@ from huggingface_hub import HfApi, upload_file, hf_hub_download
 from typing import Optional
 import pandas as pd
 class FeedbackManager:
     """管理用户反馈，支持保存到 Hugging Face 私有数据集"""
@@ -31,7 +85,6 @@ class FeedbackManager:
         if self.dataset_repo_id and self.hf_token:
             self.api = HfApi(token=self.hf_token)
             # 确保数据集存在
-            self._ensure_dataset_exists()
         else:
             self.api = None
             print("⚠️ No HF dataset configured. Will only save locally.")
@@ -44,58 +97,7 @@ class FeedbackManager:
         self.local_dir.mkdir(exist_ok=True, parents=True)
         self.local_file = self.local_dir / 'user_feedback.json'
-    def _ensure_dataset_exists(self):
-        """确保 HF 数据集存在，如果不存在则创建"""
-        try:
-            from huggingface_hub import create_repo
-            # 尝试创建数据集仓库（如果已存在会抛出异常）
-            try:
-                create_repo(
-                    repo_id=self.dataset_repo_id,
-                    token=self.hf_token,
-                    private=True,
-                    repo_type="dataset"
-                )
-                print(f"✅ Created new private dataset: {self.dataset_repo_id}")
-                # 创建初始的 README.md
-                readme_content = f"""---
-license: mit
----
-# AdaDetectGPT User Feedback Dataset
-This dataset contains user feedback from the AdaDetectGPT detection system.
-## Data Format
-Each entry contains:
-- `timestamp`: When the feedback was submitted
-- `text`: The text that was analyzed
-- `domain`: The domain selected for analysis
-- `statistics`: The computed statistics value
-- `p_value`: The p-value from the detection
-- `feedback`: User feedback (expected/unexpected)
-"""
-                readme_file = self.local_dir / 'README.md'
-                readme_file.write_text(readme_content)
-                upload_file(
-                    path_or_fileobj=str(readme_file),
-                    path_in_repo="README.md",
-                    repo_id=self.dataset_repo_id,
-                    repo_type="dataset",
-                    token=self.hf_token
-                )
-            except Exception as e:
-                if "already exists" not in str(e):
-                    print(f"⚠️ Dataset check: {e}")
-        except Exception as e:
-            print(f"⚠️ Could not verify dataset: {e}")
     def _load_existing_data(self) -> list:
         """从 HF 数据集加载现有数据"""
         existing_data = []

 from typing import Optional
 import pandas as pd
+def save_feedback_incremental(text: str, domain: str, statistics: float, p_value: float, label: str):
+    """
+    Append a single feedback record to a date-sharded NDJSON file and push.
+    Falls back to FeedbackManager on error or non-space environments.
+    """
+    try:
+        repo = get_feedback_repo(FEEDBACK_DATASET_ID, os.environ.get('HF_TOKEN'))
+        if repo is None:
+            # Fallback (local or repo init failed)
+            return feedback_manager.save_feedback(text, domain, statistics, p_value, label)
+        # Pull latest, append, commit, push
+        try:
+            repo.git_pull(rebase=True)
+        except Exception as e:
+            print(f"[feedback repo] pull warning: {e}")
+        now = datetime.datetime.utcnow()
+        date_str = now.strftime("%Y-%m-%d")
+        payload = {
+            "timestamp": now.isoformat(timespec="seconds") + "Z",
+            "space_id": os.environ.get('SPACE_ID'),
+            "domain": domain,
+            "label": label,
+            "statistics": statistics,
+            "p_value": p_value,
+            "text": text,
+            "app_version": "adadetectgpt-app-1",  # optional tag
+        }
+        feedback_dir = Path(repo.local_dir) / "feedback"
+        feedback_dir.mkdir(parents=True, exist_ok=True)
+        file_path = feedback_dir / f"{date_str}.ndjson"
+        with open(file_path, "a", encoding="utf-8") as f:
+            f.write(json.dumps(payload, ensure_ascii=False) + "\n")
+        # Commit and push only the changed file to minimize traffic
+        repo.git_add(pattern=str(file_path))
+        try:
+            repo.git_commit(f"feedback: append {file_path.name}")
+        except Exception as e:
+            # allow empty commit errors to pass silently if nothing changed
+            print(f"[feedback repo] commit info: {e}")
+        repo.git_push()
+        return True, f"Pushed to {FEEDBACK_DATASET_ID}:{file_path.name}"
+    except Exception as e:
+        # Final fallback if anything goes wrong
+        print(f"[feedback repo] incremental save failed: {e}")
+        try:
+            return feedback_manager.save_feedback(text, domain, statistics, p_value, label)
+        except Exception as e2:
+            return False, f"Fallback failed: {e2}"
 class FeedbackManager:
     """管理用户反馈，支持保存到 Hugging Face 私有数据集"""
         if self.dataset_repo_id and self.hf_token:
             self.api = HfApi(token=self.hf_token)
             # 确保数据集存在
         else:
             self.api = None
             print("⚠️ No HF dataset configured. Will only save locally.")
         self.local_dir.mkdir(exist_ok=True, parents=True)
         self.local_file = self.local_dir / 'user_feedback.json'
     def _load_existing_data(self) -> list:
         """从 HF 数据集加载现有数据"""
         existing_data = []