JiachenFu committed on
Commit
6600152
·
0 Parent(s):

update: app

Browse files
Files changed (9) hide show
  1. .gitignore +165 -0
  2. .gradio/certificate.pem +31 -0
  3. .gradio/flagged/dataset1.csv +64 -0
  4. DetectAnyLLM +1 -0
  5. LICENSE +35 -0
  6. README.md +14 -0
  7. app.py +539 -0
  8. core/model.py +255 -0
  9. requirements.txt +10 -0
.gitignore ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ ckpt/*/
12
+ logs/*/
13
+ models/*/
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ share/python-wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+ MANIFEST
31
+
32
+ # PyInstaller
33
+ # Usually these files are written by a python script from a template
34
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .nox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ *.py,cover
53
+ .hypothesis/
54
+ .pytest_cache/
55
+ cover/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # poetry
101
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105
+ #poetry.lock
106
+
107
+ # pdm
108
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109
+ #pdm.lock
110
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111
+ # in version control.
112
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
113
+ .pdm.toml
114
+ .pdm-python
115
+ .pdm-build/
116
+
117
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
118
+ __pypackages__/
119
+
120
+ # Celery stuff
121
+ celerybeat-schedule
122
+ celerybeat.pid
123
+
124
+ # SageMath parsed files
125
+ *.sage.py
126
+
127
+ # Environments
128
+ .env
129
+ .venv
130
+ env/
131
+ venv/
132
+ ENV/
133
+ env.bak/
134
+ venv.bak/
135
+
136
+ # Spyder project settings
137
+ .spyderproject
138
+ .spyproject
139
+
140
+ # Rope project settings
141
+ .ropeproject
142
+
143
+ # mkdocs documentation
144
+ /site
145
+
146
+ # mypy
147
+ .mypy_cache/
148
+ .dmypy.json
149
+ dmypy.json
150
+
151
+ # Pyre type checker
152
+ .pyre/
153
+
154
+ # pytype static type analyzer
155
+ .pytype/
156
+
157
+ # Cython debug symbols
158
+ cython_debug/
159
+
160
+ # PyCharm
161
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
162
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
164
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
165
+ #.idea/
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_text,output,timestamp
2
+ "'def greet(input_text):
3
+ sub_texts = text_splitter.split_text(input_text) # 修改为split_text获取文本列表
4
+ html_output = []
5
+ for sub_text in sub_texts:
6
+ tokenized = scoring_tokenizer(sub_text, truncation=True, return_tensors=""pt"", padding=True, return_token_type_ids=False).to(device)
7
+ labels = tokenized.input_ids[:, 1:]
8
+ with torch.no_grad():
9
+ logits_score = scoring_model(**tokenized).logits[:, :-1]
10
+ logits_ref = logits_score
11
+ crit, _ = criterion_fn(logits_ref, logits_score, labels)
12
+
13
+ crit = crit.cpu().numpy().item()
14
+ prob = prob_estimator.crit_to_prob(crit)
15
+
16
+ # 根据概率值设置颜色
17
+ if prob >= 0.7:
18
+ color = ""red""
19
+ elif prob >= 0.3:
20
+ color = ""orange""
21
+ else:
22
+ color = ""white""
23
+
24
+ # 创建带样式的HTML内容
25
+ html_output.append(f'<span style=""color: {color};"">{sub_text} (Probability: {prob:.2f})</span>')
26
+
27
+ # 用换行连接所有结果
28
+ return ""<br>"".join(html_output)
29
+
30
+ demo = gr.Interface(
31
+ fn=greet,
32
+ inputs=[""text""],
33
+ outputs=gr.HTML() # 修改为HTML输出组件
34
+ )","'<span style=""color: white;"">def greet(input_text):
35
+ sub_texts = text_splitter.split_text(input_text) # 修改为split_text获取文本列表
36
+ html_output = []
37
+ for (Probability: 0.09)</span><br><span style=""color: white;"">sub_text in sub_texts:
38
+ tokenized = scoring_tokenizer(sub_text, truncation=True, return_tensors=""pt"", padding=True, retur (Probability: 0.03)</span><br><span style=""color: white;"">n_token_type_ids=False).to(device)
39
+ labels = tokenized.input_ids[:, 1:]
40
+ with torch.no_grad():
41
+ logits_ (Probability: 0.05)</span><br><span style=""color: white;"">score = scoring_model(**tokenized).logits[:, :-1]
42
+ logits_ref = logits_score
43
+ crit, _ = criterion_fn(logit (Probability: 0.00)</span><br><span style=""color: white;"">s_ref, logits_score, labels)
44
+
45
+ crit = crit.cpu().numpy().item()
46
+ prob = prob_estimator.crit_to_prob(crit) (Probability: 0.02)</span><br><span style=""color: white;""># 根据概率值设置颜色
47
+ if prob >= 0.7:
48
+ color = ""red""
49
+ elif prob >= 0.3:
50
+ color = ""or (Probability: 0.09)</span><br><span style=""color: white;"">ange""
51
+ else:
52
+ color = ""white""
53
+
54
+ # 创建带样式的HTML内容
55
+ html_output.append(f'<span style=""color: (Probability: 0.19)</span><br><span style=""color: white;"">{color};"">{sub_text} (Probability: {prob:.2f})</span>')
56
+
57
+ # 用换行连接所有结果
58
+ return ""<br>"".join(html_output)
59
+
60
+ demo = gr.Int (Probability: 0.01)</span><br><span style=""color: white;"">erface(
61
+ fn=greet,
62
+ inputs=[""text""],
63
+ outputs=gr.HTML() # 修改为HTML输出组件
64
+ ) (Probability: 0.06)</span>",2025-01-30 11:44:36.020197
DetectAnyLLM ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 2d182abad5143fc1183cfedca2a30f58c3d44e7e
LICENSE ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Pi-Lab License 1.0
2
+
3
+ Copyright 2025 Pi-Lab
4
+
5
+ Redistribution and use for non-commercial purpose in source and
6
+ binary forms, with or without modification, are permitted provided
7
+ that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright
10
+ notice, this list of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright
13
+ notice, this list of conditions and the following disclaimer in
14
+ the documentation and/or other materials provided with the
15
+ distribution.
16
+
17
+ 3. Neither the name of the copyright holder nor the names of its
18
+ contributors may be used to endorse or promote products derived
19
+ from this software without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
+
33
+ In the event that redistribution and/or use for commercial purpose in
34
+ source or binary forms, with or without modification is required,
35
+ please contact the contributor(s) of the work.
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: DetectAnyLLM
3
+ emoji: 🔥
4
+ colorFrom: pink
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 5.46.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: other
11
+ short_description: '[ACMMM 2025] State-Of-The-Art AI-Text Detector'
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch
4
+ import os
5
+ import json
6
+ from core.model import DiscrepancyEstimator
7
+ import re
8
+ import docx
9
+ import spaces
10
+ from datasets import load_dataset
11
+
12
+
13
def read_file_content(file):
    """Return the text content of an uploaded file.

    Supports plain-text (.txt) and Word (.docx) uploads; any other
    input — including no file at all — yields an empty string.
    """
    if file is None:
        return ""
    name = file.name
    if name.endswith('.txt'):
        with open(name, 'r', encoding='utf-8') as handle:
            return handle.read()
    if name.endswith('.docx'):
        # python-docx exposes paragraphs; join them with newlines.
        document = docx.Document(name)
        return '\n'.join(paragraph.text for paragraph in document.paragraphs)
    # Unsupported extension: silently ignore, matching the UI's contract.
    return ""
26
+
27
def split_sentences(text):
    """Split *text* into sentences on Chinese (。) or Western (.) full
    stops, keeping each terminator attached to its sentence.

    Empty/whitespace-only fragments are dropped.
    """
    parts = re.split(r'([。.])', text)
    merged = []
    # re.split with a capturing group alternates text / separator, so
    # stitch them back together pairwise.
    for idx in range(0, len(parts) - 1, 2):
        merged.append(parts[idx] + parts[idx + 1])
    if len(parts) % 2 == 1:
        # Trailing fragment with no terminator.
        merged.append(parts[-1])
    return [piece.strip() for piece in merged if piece.strip()]
34
+
35
def count_words(sentence, language='Chinese'):
    """Measure a sentence's length: characters for Chinese, otherwise
    whitespace-delimited words. Newlines are stripped first.
    """
    cleaned = sentence.replace('\n', '').replace('\r', '')
    if language == 'Chinese':
        return len(cleaned)
    return len(cleaned.split())
38
+
39
def segment_text(sentences, language='Chinese'):
    """Regroup *sentences* into segments of roughly 100 words/characters.

    Rules: sentences accumulate until a segment would exceed 100 units;
    a single sentence longer than 100 units is merged with its successor
    when the pair stays within 200 units, otherwise stands alone; a final
    segment shorter than 100 units is folded back into the previous one
    when the merge stays within 200 units.

    Bug fix: the original used ``i += 1`` inside ``for i, ... in
    enumerate(...)`` to skip the sentence just merged — in Python that
    does nothing, so the next sentence was emitted twice (once in the
    merged pair, once on its own). An explicit while-loop index makes
    the skip real.

    :param sentences: list of sentence strings (see split_sentences).
    :param language: 'Chinese' counts characters and joins with '',
        anything else counts words and joins with ' '.
    :return: list of segment strings.
    """
    joiner = '' if language == 'Chinese' else ' '
    result = []
    current_segment = []
    current_length = 0

    i = 0
    total = len(sentences)
    while i < total:
        sentence = sentences[i]
        word_count = count_words(sentence, language)

        if word_count > 100:
            # Oversized sentence: try to pair it with the next one.
            if i + 1 < total and word_count + count_words(sentences[i + 1], language) <= 200:
                if current_segment:  # flush the accumulated segment first
                    result.append(joiner.join(current_segment))
                result.append(sentence + joiner + sentences[i + 1])
                current_segment = []
                current_length = 0
                i += 2  # consume BOTH sentences (this skip now works)
                continue
            # Too big to pair: emit it on its own.
            if current_segment:
                result.append(joiner.join(current_segment))
            result.append(sentence)
            current_segment = []
            current_length = 0
        elif current_length + word_count > 100:
            # Segment full: flush it and start a new one.
            if current_segment:
                result.append(joiner.join(current_segment))
            current_segment = [sentence]
            current_length = word_count
        else:
            # Keep accumulating.
            current_segment.append(sentence)
            current_length += word_count
        i += 1

    # Handle the trailing segment.
    if current_segment:
        if current_length < 100 and result and current_length + count_words(result[-1], language) <= 200:
            # Short tail: fold it back into the previous segment.
            last_segment = result.pop()
            pieces = last_segment.split() if language != 'Chinese' else list(last_segment)
            result.append(joiner.join(pieces + current_segment))
        else:
            result.append(joiner.join(current_segment))

    return result
90
+
91
def extract_latex_text(latex_source):
    """Strip LaTeX markup from *latex_source*, returning best-effort
    plain text for detection.
    """
    # Prefer the body of the document environment when one exists.
    body_re = re.compile(r'\\begin{document}(.*?)\\end{document}', re.DOTALL)
    found = body_re.search(latex_source)
    text = found.group(1) if found else latex_source

    # Drop comments: an unescaped % up to end of line.
    text = re.sub(r'(?<!\\)%.*', '', text, flags=re.MULTILINE)

    # Remove common non-text environments wholesale.
    skip_envs = ['figure', 'table', 'equation', 'align\*?', 'verbatim', 'lstlisting']
    env_re = re.compile(
        r'\\begin{(' + '|'.join(skip_envs) + r')}.*?\\end{\1}',
        re.DOTALL
    )
    text = env_re.sub('', text)

    # Remove \cite commands together with their arguments.
    text = re.sub(r'\\cite(\[[^\]]*\])?\{[^}]*\}', '', text)

    # Remove inline \table / \figure commands and their arguments.
    text = re.sub(r'\\(table|figure)\*?(\[[^\]]*\])?\{[^}]*\}', '', text)

    # Strip simple (argument-less) command names.
    text = re.sub(r'\\([a-zA-Z]+)\*?\b', '', text)

    # Unwrap argument-taking commands, iterating to peel nesting
    # (capped at 10 passes so malformed input cannot loop forever).
    for _ in range(10):
        unwrapped = re.sub(
            r'\\([a-zA-Z]+)\*?(?:\[.*?\])*{((?:[^{}]*|{[^{}]*})*)}',
            lambda m: m.group(2),
            text,
            flags=re.DOTALL
        )
        if unwrapped == text:
            break
        text = unwrapped

    # Normalise escapes, line breaks and typographic quotes.
    replacements = {
        '~': ' ', '\\&': '&', '\\$': '$', '\\%': '%',
        '\\_': '_', '\\#': '#', '\\\\': '\n', '\n': ' ',
        '“': '"', '”': '"', '‘': "'", '’': "'"
    }
    for old, new in replacements.items():
        text = text.replace(old, new)

    # Collapse runs of whitespace.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n{2,}', '\n\n', text)
    return text.strip()
142
+
143
class ProbEstimator:
    """Maps a raw discrepancy score onto per-task probabilities of being
    machine-written, using reference score distributions loaded from a
    Hugging Face dataset repository.
    """

    def __init__(self, ref_file_dir):
        """Load reference discrepancy scores for each detection task
        from ``<ref_file_dir>/<task>.json``.
        """
        self.tasks = ["polish", "generate", "rewrite"]
        self.real_crits = {"polish": [], "generate": [], "rewrite": []}
        self.fake_crits = {"polish": [], "generate": [], "rewrite": []}
        for task in self.tasks:
            reference = load_dataset(ref_file_dir, data_files=f'{task}.json')['train']
            self.real_crits[task].extend(reference['original_discrepancy'])
            self.fake_crits[task].extend(reference['rewritten_discrepancy'])
        sample_total = sum(len(self.real_crits[task]) for task in self.tasks) * 2
        print(f'ProbEstimator: total {sample_total} samples.')

    def crit_to_prob(self, crit):
        """Estimate, per task, P(machine-written | score == *crit*).

        Uses a windowed nearest-neighbour count: the window radius is
        the 10th-percentile distance from *crit* to all reference
        scores; the probability is the fraction of fake scores among
        reference scores inside the window (0.5 when none fall inside).
        """
        probs = {}
        for task in self.tasks:
            real = np.array(self.real_crits[task])
            fake = np.array(self.fake_crits[task])
            total_len = real.size + fake.size
            distances = np.abs(np.concatenate((real, fake)) - crit)
            offset = np.sort(distances)[int(0.1 * total_len)]
            low, high = crit - offset, crit + offset
            cnt_real = np.sum((real > low) & (real < high))
            cnt_fake = np.sum((fake > low) & (fake < high))
            in_window = cnt_real + cnt_fake
            probs[task] = (cnt_fake / in_window) if in_window > 0 else 0.5
        return probs
165
+
166
# Inference device. Assumes a CUDA GPU is present (the app is decorated
# with @spaces.GPU below) — NOTE(review): no CPU fallback; confirm the
# deployment always provides a GPU.
device = 'cuda'
# Reference discrepancy-score distributions, loaded once at startup from
# the Hugging Face Hub — one estimator per supported language.
zh_prob_estimator = ProbEstimator(ref_file_dir="JiachenFu/Qwen2-0.5B-detectanyllm-detector-ref-zh")
en_prob_estimator = ProbEstimator(ref_file_dir="JiachenFu/Qwen2-0.5B-detectanyllm-detector-ref-en")
169
+
170
@spaces.GPU
def greet(mode, language, input_text):
    """Run AI-text detection over *input_text* and return an HTML report.

    Args:
        mode: "LaTex" strips LaTeX markup before detection; any other
            value treats the input as plain text.
        language: "Chinese" or anything else (English); selects the
            detector checkpoint and the reference-score estimator.
        input_text: raw text (or LaTeX source) to analyse.

    Returns:
        An HTML string with each ~100-word segment coloured by verdict
        (red = likely AI-generated, orange = likely AI-revised,
        black = likely human) plus an overall-rate summary box.
    """
    if mode == "LaTex":
        input_text = extract_latex_text(input_text)
    # Split into sentences, then regroup into ~100-unit scoring segments.
    split_texts = split_sentences(input_text)
    sub_texts = segment_text(split_texts, language=language)
    detected = []
    # NOTE(review): the detector is re-instantiated from the Hub on every
    # request; caching the model would avoid repeated loads — confirm this
    # is intentional (e.g. for @spaces.GPU memory management).
    if language == "Chinese":
        model = DiscrepancyEstimator(load_directory="JiachenFu/Qwen2-0.5B-detectanyllm-detector-zh").to(device)
        prob_estimator = zh_prob_estimator
    else:
        model = DiscrepancyEstimator(load_directory="JiachenFu/Qwen2-0.5B-detectanyllm-detector-en").to(device)
        prob_estimator = en_prob_estimator
    model.eval()
    # Score each segment independently.
    for i, sub_text in enumerate(sub_texts):
        text_content = sub_text
        print(f'processing {sub_text}')
        tokens = model.scoring_tokenizer(
            text_content, return_tensors='pt', padding=True, truncation=True, return_token_type_ids=False
        )
        print(f'tokenized')
        input_ids = tokens['input_ids'].to(device)
        attention_mask = tokens['attention_mask'].to(device)
        with torch.no_grad():
            # Reference inputs are None: the model scores against itself.
            output = model.get_discrepancy_of_scoring_and_reference_models(
                input_ids_for_scoring_model=input_ids,
                attention_mask_for_scoring_model=attention_mask,
                input_ids_for_reference_model=None,
                attention_mask_for_reference_model=None,
            )
        discrepancy = output['scoring_discrepancy']
        discrepancy = discrepancy.cpu().numpy().item()
        print(f'discrepancy: {discrepancy}')
        probs = prob_estimator.crit_to_prob(discrepancy)
        # Below this raw-score threshold the segment is forced to "human"
        # regardless of the reference distributions.
        # NOTE(review): 15 appears to be an empirical cutoff — confirm.
        if discrepancy < 15:
            for task in probs.keys():
                probs[task] = 0.0
        detected.append({
            'order': i,
            'text': text_content,
            'words_count': len(text_content) if language == "Chinese" else len(text_content.split()),
            'probs': probs
        })

    # Absolutely-positioned summary box is appended at the end;
    # first build the per-character reveal animation markup.
    html_output = '''
    <style>
    @keyframes reveal {
        from { opacity: 0; }
        to { opacity: 1; }
    }
    .reveal-char {
        opacity: 0;
        animation: reveal 0.2s forwards;
        white-space: pre-wrap;
    }
    </style>
    <div style="position: relative; padding-bottom: 60px; min-height: 120px;">
    '''

    current_delay = 0.0  # current animation start delay (seconds)
    char_duration = 0.001  # delay increment per character (seconds)

    # Colour each segment by its dominant verdict, one <span> per character.
    for item in detected:
        ai_generate_prob = item['probs']['generate']
        ai_revise_prob = max(item['probs']['polish'], item['probs']['rewrite'])
        prob = max(ai_generate_prob, ai_revise_prob)
        if prob >= 0.75:
            if ai_generate_prob >= ai_revise_prob:
                color = "red"
                item["generate"] = 1
                item["revise"] = 0
            else:
                color = "orange"
                item["generate"] = 0
                item["revise"] = 1
        else:
            color = "black"
            item["generate"] = 0
            item["revise"] = 0

        for char in item['text']:
            html_output += f'<span class="reveal-char" style="color: {color}; animation-delay: {current_delay:.2f}s;">{char}</span>'
            current_delay += char_duration

    # Length-weighted overall rates over all segments.
    total_length = sum(item['words_count'] for item in detected)
    # total_prob = sum(item['prob'] * item['words_count'] for item in detected) / total_length if total_length > 0 else 0
    generate_prob = sum(item["generate"] * item["words_count"] for item in detected) / total_length if total_length > 0 else 0
    revise_prob = sum(item["revise"] * item["words_count"] for item in detected) / total_length if total_length > 0 else 0
    html_output += f'''
    <div style="
        position: absolute;
        bottom: 0;
        right: 0;
        background-color: rgba(255, 255, 255, 0.9);
        padding: 8px 12px;
        border-radius: 4px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        border: 1px solid #e0e0e0;
        font-size: 14px;
    ">
        🤖 AI Generated Rate: <strong>{generate_prob:.2%}</strong><br>
        ✍️ AI Revised Rate: <strong>{revise_prob:.2%}</strong>
    </div>
    '''

    html_output += '</div>'
    return html_output
280
+
281
# Use gr.Blocks instead of gr.Interface for full control over layout.
# All visual styling is carried by the css string below.
with gr.Blocks(css="""
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap');

:root {
    --accent-color: #6366f1;
    --text-color: #374151;
    --border-color: #e5e7eb;
    --background-light: #f9fafb;
    --background-card: #ffffff;
}

body, .gradio-container {
    background: var(--background-light);
    font-family: 'Inter', sans-serif;
    color: var(--text-color);
}

#header {
    text-align: center;
    padding: 2rem;
    margin: 0 auto; /* Use gap for spacing, remove margin-bottom */
    background-color: var(--background-card);
    background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='40' height='40' viewBox='0 0 40 40'%3E%3Cg fill-rule='evenodd'%3E%3Cg fill='%23e5e7eb' fill-opacity='0.3'%3E%3Cpath d='M0 38.59l2.83-2.83 1.41 1.41L1.41 40H0v-1.41zM0 1.4l2.83 2.83 1.41-1.41L1.41 0H0v1.41zM38.59 40l-2.83-2.83 1.41-1.41L40 38.59V40h-1.41zM40 1.41l-2.83 2.83-1.41-1.41L38.59 0H40v1.41zM20 18.6l2.83-2.83 1.41 1.41L21.41 20l2.83 2.83-1.41 1.41L20 21.41l-2.83 2.83-1.41-1.41L18.59 20l-2.83-2.83 1.41-1.41L20 18.59z'/%3E%3C/g%3E%3C/g%3E%3C/svg%3E");
    border: 1px solid var(--border-color);
    border-radius: 16px;
    box-shadow: 0 4px 12px rgba(0,0,0,0.05);
}
#title {
    font-weight: 800;
    font-size: 2.5em;
    letter-spacing: -0.02em;
    color: var(--text-color);
    margin-bottom: 0.25em;
}
.detect-grad {
    background: -webkit-linear-gradient(left, #ff8c8c, #ffc89e);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-weight: 800;
}
.anyllm-grad {
    background: -webkit-linear-gradient(left, #a0e6ff, #aaffd4);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-weight: 800;
}
#authors {
    font-size: 1.1em;
    color: #6b7280;
    margin: 0;
}

#main-container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 0 1rem;
    gap: 2rem; /* Add gap for consistent spacing */
}

#controls-row {
    justify-content: center;
    gap: 2rem;
}

/* Custom styles for Radio Button Groups */
#controls-row > div {
    background-color: var(--background-card);
    background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='40' height='40' viewBox='0 0 40 40'%3E%3Cg fill-rule='evenodd'%3E%3Cg fill='%23e5e7eb' fill-opacity='0.3'%3E%3Cpath d='M0 38.59l2.83-2.83 1.41 1.41L1.41 40H0v-1.41zM0 1.4l2.83 2.83 1.41-1.41L1.41 0H0v1.41zM38.59 40l-2.83-2.83 1.41-1.41L40 38.59V40h-1.41zM40 1.41l-2.83 2.83-1.41-1.41L38.59 0H40v1.41zM20 18.6l2.83-2.83 1.41 1.41L21.41 20l2.83 2.83-1.41 1.41L20 21.41l-2.83 2.83-1.41-1.41L18.59 20l-2.83-2.83 1.41-1.41L20 18.59z'/%3E%3C/g%3E%3C/g%3E%3C/svg%3E");
    border: 1px solid var(--border-color);
    border-radius: 16px;
    padding: 1rem;
    box-shadow: 0 4px 12px rgba(0,0,0,0.05);
}

#controls-row .gradio-button {
    border-radius: 10px !important;
    transition: background-color 0.2s ease, color 0.2s ease;
}

#controls-row .gradio-button.selected {
    background: var(--accent-color) !important;
    color: white !important;
    border-color: var(--accent-color) !important;
}

#content-row {
    gap: 1.5rem;
}

.card {
    background-color: var(--background-card);
    background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='40' height='40' viewBox='0 0 40 40'%3E%3Cg fill-rule='evenodd'%3E%3Cg fill='%23e5e7eb' fill-opacity='0.3'%3E%3Cpath d='M0 38.59l2.83-2.83 1.41 1.41L1.41 40H0v-1.41zM0 1.4l2.83 2.83 1.41-1.41L1.41 0H0v1.41zM38.59 40l-2.83-2.83 1.41-1.41L40 38.59V40h-1.41zM40 1.41l-2.83 2.83-1.41-1.41L38.59 0H40v1.41zM20 18.6l2.83-2.83 1.41 1.41L21.41 20l2.83 2.83-1.41 1.41L20 21.41l-2.83 2.83-1.41-1.41L18.59 20l-2.83-2.83 1.41-1.41L20 18.59z'/%3E%3C/g%3E%3C/g%3E%3C/svg%3E");
    border: 1px solid var(--border-color);
    border-radius: 16px;
    padding: 1.5rem;
    box-shadow: 0 4px 12px rgba(0,0,0,0.05);
    height: 100%;
    display: flex;
    flex-direction: column;
    gap: 1rem;
}

.card-title {
    font-weight: 600;
    font-size: 1.2rem;
    color: var(--text-color);
    padding-bottom: 0.75rem;
    border-bottom: 1px solid var(--border-color);
}

#input-text textarea {
    flex-grow: 1;
    border: none !important;
    box-shadow: none !important;
    padding: 0 !important;
    font-size: 1.1em;
    line-height: 1.7;
}

#result-html {
    flex-grow: 1;
    font-size: 1.1em;
    line-height: 1.7;
    overflow-y: auto;
    height: 520px;
}

#input-footer {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-top: auto; /* Push to bottom */
}

#char-counter {
    font-size: 0.9em;
    color: #9ca3af;
}
#char-counter.error {
    color: #ef4444;
}

#submit-btn {
    flex-grow: 1;
    max-width: 200px;
    font-size: 1.05em;
    font-weight: 600;
    background: var(--accent-color);
    color: white;
    border-radius: 10px;
}
#submit-btn:hover {
    background: #4f46e5;
}

.disclaimer {
    text-align: center;
    margin: 0 auto; /* Remove vertical margins */
    color: #64748b;
    font-size: 1.1em;
    max-width: 800px;
}
/* Reveal 动画更丝滑 */
@keyframes reveal {
    from { opacity: 0; }
    to { opacity: 1; }
}
.reveal-char {
    opacity: 0;
    animation: reveal 0.2s forwards;
    white-space: pre-wrap;
}
""") as demo:
    with gr.Column(elem_id="main-container"):
        # Page header: title + authors.
        gr.Markdown("""
<div id="header">
    <h1 id="title"><span class="detect-grad">Detect</span><span class="anyllm-grad">AnyLLM</span>: Towards Generalizable and Robust Detection of Machine-Generated Text Across Domains and Models</h1>
    <p id="authors">Jiachen Fu, Chun-Le Guo, Chongyi Li</p>
</div>
""")

        # Detection options: language and input type.
        with gr.Row(elem_id="controls-row"):
            language_radio = gr.Radio(
                choices=["English", "Chinese"],
                value="English",
                label="🌐 Language",
                interactive=True
            )
            mode_radio = gr.Radio(
                choices=["Text-Only", "LaTex"],
                value="Text-Only",
                label="✍️ Input Type",
                interactive=True
            )

        # Two-column layout: input card (left) and result card (right).
        with gr.Row(equal_height=True, elem_id="content-row"):
            with gr.Column(scale=1, min_width=500):
                with gr.Column(elem_classes="card"):
                    gr.HTML('<div class="card-title">📝 Input</div>')
                    upload_btn = gr.File(
                        label="Upload File (txt, docx)",
                        file_types=['.txt', '.docx'],
                        elem_id="upload-btn"
                    )
                    input_text = gr.Textbox(
                        show_label=False,
                        placeholder="Enter text to detect or upload a file...",
                        lines=15,
                        elem_id="input-text",
                        max_length=100000,
                    )
                    with gr.Row(elem_id="input-footer"):
                        counter_html = gr.HTML("<div id='char-counter'>0/100000</div>")
                        submit_btn = gr.Button("✨ Detect", variant="primary", elem_id="submit-btn")

            with gr.Column(scale=1, min_width=500):
                with gr.Column(elem_classes="card"):
                    gr.HTML('<div class="card-title">🔍 Result</div>')
                    result = gr.HTML(elem_id="result-html")

        # Colour-legend disclaimer shown under the cards.
        gr.HTML("""
<div class="disclaimer">
    💡 <i><b style="color: red;">Red fonts</b> indicate a high probability of AI generation. <b style="color: orange;">Orange fonts</b> indicate a high probability of AI revision or polishing. The detection results are for reference only.</i>
</div>
""")

    # Uploading a file replaces the textbox content with the file's text.
    upload_btn.upload(
        read_file_content,
        inputs=upload_btn,
        outputs=input_text
    )

    # Client-side character counter; runs in the browser only (js=).
    input_text.input(
        None,
        [input_text],
        None,
        js="""
        (text) => {
            setTimeout(() => {
                const counter = document.getElementById("char-counter");
                if (counter) {
                    const length = text.length;
                    counter.innerHTML = `${length}/100000`;
                    counter.classList.toggle("error", length > 100000);
                }
            }, 0);
            return text;
        }
        """
    )
    # Detect button triggers the GPU-backed scoring pipeline.
    submit_btn.click(
        greet,
        inputs=[mode_radio, language_radio, input_text],
        outputs=result
    )

demo.launch(share=True)
core/model.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import sys
5
+ import os
6
+ import time
7
+ import copy
8
+ from peft import get_peft_model, LoraConfig, TaskType, AutoPeftModelForCausalLM
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer
10
+
11
def from_pretrained(cls, model_name, kwargs=None, cache_dir=None):
    """Load a Hugging Face model/tokenizer, preferring a local cached copy.

    Args:
        cls: Hugging Face loader class (e.g. ``AutoModelForCausalLM`` or
            ``AutoTokenizer``) whose ``from_pretrained`` is invoked.
        model_name: Hub repo id (possibly containing ``/``) or plain name.
        kwargs: Optional dict of extra keyword arguments forwarded to
            ``cls.from_pretrained``. Defaults to ``{}``.
        cache_dir: Local cache directory. When it contains a folder named
            after the repo's final path component, that folder is loaded
            directly without contacting the Hub.

    Returns:
        Whatever ``cls.from_pretrained`` returns (model or tokenizer).
    """
    if kwargs is None:
        kwargs = {}
    if cache_dir is not None:
        # `split("/")[-1]` yields the name unchanged when there is no "/",
        # so one expression covers both repo-id and plain-name inputs.
        local_path = os.path.join(cache_dir, model_name.split("/")[-1])
        if os.path.exists(local_path):
            # Local copy exists: load it directly, skipping the Hub.
            return cls.from_pretrained(local_path, **kwargs)
    # Fall back to downloading from the Hub into cache_dir.
    return cls.from_pretrained(model_name, **kwargs, cache_dir=cache_dir, device_map='auto')
21
+
22
+
23
class DiscrepancyEstimator(nn.Module):
    """Estimate sampling discrepancy between a scoring LM and a reference LM.

    Holds a "scoring" causal language model (optionally LoRA-adapted via
    ``add_lora_config`` or restored from a PEFT checkpoint via
    ``load_pretrained``) and, depending on ``train_method``, an optional
    "reference" causal language model.

    NOTE(review): the precise semantics of the 'DDL' and 'SPO' training
    methods are not defined in this file. From the code below: 'DDL' runs
    without a separate reference model (the scoring logits double as the
    reference distribution), while 'SPO' requires one (a deep copy of the
    scoring model is made if none is supplied).
    """

    def __init__(self,
                 scoring_model_name: str=None,
                 reference_model_name: str=None,
                 scoring_model: AutoModelForCausalLM=None,
                 reference_model: AutoModelForCausalLM=None,
                 scoring_tokenizer: AutoTokenizer=None,
                 reference_tokenizer: AutoTokenizer=None,
                 cache_dir: str=None,
                 train_method: str='DDL',
                 pretrained_ckpt: str=None,
                 ):
        """Build the estimator, loading models by name or taking instances.

        Args:
            scoring_model_name: Hub name of the scoring model; when given,
                the model and its tokenizer are loaded from ``cache_dir``.
            reference_model_name: Hub name of the reference model.
            scoring_model / scoring_tokenizer: pre-instantiated scoring
                pair, used when ``scoring_model_name`` is None (both
                required in that case).
            reference_model / reference_tokenizer: pre-instantiated
                reference pair; must be provided together or not at all.
            cache_dir: local cache directory forwarded to the loaders.
            train_method: 'DDL' or 'SPO' (asserted below).
            pretrained_ckpt: if given, all other model arguments are
                ignored and state is restored via ``load_pretrained``.
        """
        super().__init__()
        assert train_method in ['DDL', 'SPO'], 'train_method should be DDL or SPO.'
        self.train_method = train_method
        self.cache_dir = cache_dir
        if pretrained_ckpt is not None:
            # A saved PEFT checkpoint takes priority over every other source.
            self.load_pretrained(pretrained_ckpt)
        else:
            if scoring_model_name is not None:
                # GPT-J checkpoints ship a dedicated fp16 revision; load it
                # to halve memory. Other models use default kwargs.
                if 'gpt-j' in scoring_model_name or 'GPT-J' in scoring_model_name:
                    model_kwargs = dict(
                        torch_dtype=torch.float16,
                        revision='float16'
                    )
                else:
                    model_kwargs = {}
                self.scoring_model_name = scoring_model_name
                self.scoring_model = from_pretrained(AutoModelForCausalLM,
                                                     scoring_model_name,
                                                     cache_dir=cache_dir,
                                                     kwargs=model_kwargs)
                # OPT's fast tokenizer is avoided; right padding keeps the
                # label/logit alignment used in the discrepancy code below.
                self.scoring_tokenizer = from_pretrained(AutoTokenizer,
                                                         scoring_model_name,
                                                         kwargs={'padding_side': 'right',
                                                                 'use_fast': True if 'facebook/opt-' not in scoring_model_name else False},
                                                         cache_dir=cache_dir,)
            else:
                if scoring_model is None or scoring_tokenizer is None:
                    raise ValueError('You should provide scoring_model_name or scoring_model and scoring_tokenizer.')
                self.scoring_model = scoring_model
                self.scoring_tokenizer = scoring_tokenizer
                self.scoring_model_name = scoring_model.config._name_or_path
            # Many causal LMs ship without a pad token; reuse EOS for padding.
            if self.scoring_tokenizer.pad_token is None:
                self.scoring_tokenizer.pad_token = self.scoring_tokenizer.eos_token
                self.scoring_tokenizer.pad_token_id = self.scoring_tokenizer.eos_token_id

            if reference_model_name is not None:
                if 'gpt-j' in reference_model_name or 'GPT-J' in reference_model_name:
                    model_kwargs = dict(
                        torch_dtype=torch.float16,
                        revision='float16'
                    )
                else:
                    model_kwargs = {}
                self.reference_model = from_pretrained(AutoModelForCausalLM,
                                                       reference_model_name,
                                                       cache_dir=cache_dir,
                                                       kwargs=model_kwargs)
                self.reference_tokenizer = from_pretrained(AutoTokenizer,
                                                           reference_model_name,
                                                           kwargs={'padding_side': 'right',
                                                                   'use_fast': True if 'facebook/opt-' not in reference_model_name else False},
                                                           cache_dir=cache_dir,)
                self.reference_model_name = reference_model_name
            else:
                if reference_model is None and reference_tokenizer is None:
                    if train_method == 'DDL':
                        # DDL needs no separate reference model.
                        self.reference_model = None
                        self.reference_tokenizer = None
                        self.reference_model_name = None
                    else:
                        # SPO: default the reference to a frozen-weights copy
                        # of the scoring model, sharing its tokenizer.
                        self.reference_model = copy.deepcopy(self.scoring_model)
                        self.reference_tokenizer = self.scoring_tokenizer
                        self.reference_model_name = self.reference_model.config._name_or_path
                elif reference_model is not None and reference_tokenizer is not None:
                    self.reference_model = reference_model
                    self.reference_tokenizer = reference_tokenizer
                    self.reference_model_name = reference_model.config._name_or_path
                else:
                    raise ValueError('You should provide reference_model and reference_tokenizer at the same time.')

            if self.reference_tokenizer is not None:
                if self.reference_tokenizer.pad_token is None:
                    self.reference_tokenizer.pad_token = self.reference_tokenizer.eos_token
                    self.reference_tokenizer.pad_token_id = self.reference_tokenizer.eos_token_id

    def add_lora_config(self, lora_config: LoraConfig):
        """Wrap the scoring model with a PEFT LoRA adapter in place."""
        self.lora_config = lora_config
        self.scoring_model = get_peft_model(self.scoring_model, self.lora_config)

    def load_pretrained(self, load_directory, load_directory_ref=None):
        """
        Load the model's state_dict from the specified directory.

        The scoring model is restored as a PEFT (LoRA) model from
        ``load_directory``; an optional plain reference model is restored
        from ``load_directory_ref``. Pad tokens default to EOS when unset.

        Raises:
            ValueError: if ``load_directory`` does not exist.
        """
        if not os.path.exists(load_directory):
            raise ValueError(f"Directory {load_directory} does not exist.")

        # NOTE(review): fp16 loading is keyed on the directory NAME containing
        # 'gpt-j'/'GPT-J' — checkpoints saved under other names load in full
        # precision; confirm this matches how checkpoints are organized.
        if 'gpt-j' in load_directory or 'GPT-J' in load_directory:
            model_kwargs = dict(
                torch_dtype=torch.float16,
                revision='float16'
            )
        else:
            model_kwargs = {}

        self.scoring_model = AutoPeftModelForCausalLM.from_pretrained(load_directory, **model_kwargs)
        self.scoring_tokenizer = AutoTokenizer.from_pretrained(load_directory)
        self.scoring_model_name = self.scoring_model.config._name_or_path

        if load_directory_ref:
            self.reference_model = AutoModelForCausalLM.from_pretrained(load_directory_ref, **model_kwargs)
            self.reference_tokenizer = AutoTokenizer.from_pretrained(load_directory_ref)
            self.reference_model_name = self.reference_model.config._name_or_path
        else:
            self.reference_model = None
            self.reference_tokenizer = None
            self.reference_model_name = None

        if self.scoring_tokenizer.pad_token is None:
            self.scoring_tokenizer.pad_token = self.scoring_tokenizer.eos_token
            self.scoring_tokenizer.pad_token_id = self.scoring_tokenizer.eos_token_id
        if self.reference_tokenizer is not None:
            if self.reference_tokenizer.pad_token is None:
                self.reference_tokenizer.pad_token = self.reference_tokenizer.eos_token
                self.reference_tokenizer.pad_token_id = self.reference_tokenizer.eos_token_id


    def get_sampling_discrepancy_analytic(self, reference_logits, scoring_logits, labels, attention_mask):
        """Compute the analytic sampling discrepancy per sequence.

        For each sequence, computes
        ``(sum log p_score(label) - sum E_ref[log p_score]) / sqrt(sum Var_ref[log p_score])``
        where expectation/variance are taken under the reference
        distribution, summing only over non-padding positions.

        Args:
            reference_logits: [bsz, seq_len-1, vocab] reference-model logits.
            scoring_logits: [bsz, seq_len-1, vocab] scoring-model logits.
            labels: next-token ids aligned with the logits.
            attention_mask: [bsz, seq_len] mask; position 0 is dropped to
                align with the shifted labels.

        Returns:
            Tuple ``(discrepancy, log_likelihood_sum)``, each of shape [bsz].
        """
        # Different tokenizer families can disagree on vocab size (e.g. added
        # special tokens); truncate both to the shared prefix of the vocab.
        if reference_logits.size(-1) != scoring_logits.size(-1):
            vocab_size = min(reference_logits.size(-1), scoring_logits.size(-1))
            reference_logits = reference_logits[:, :, :vocab_size]
            scoring_logits = scoring_logits[:, :, :vocab_size]

        labels = labels.unsqueeze(-1) if labels.ndim == scoring_logits.ndim - 1 else labels
        lprobs_score = torch.log_softmax(scoring_logits, dim=-1)
        probs_ref = torch.softmax(reference_logits, dim=-1)

        log_likelihood = lprobs_score.gather(dim=-1, index=labels).squeeze(-1)
        mean_ref = (probs_ref * lprobs_score).sum(dim=-1)
        var_ref = (probs_ref * torch.square(lprobs_score)).sum(dim=-1) - torch.square(mean_ref)

        mask = attention_mask[:, 1:].float()  # [bsz, seq_len-1], 1 for non-pad, 0 for pad
        log_likelihood_sum = (log_likelihood * mask).sum(dim=-1)  # [bsz], sum over non-pad tokens
        mean_ref_sum = (mean_ref * mask).sum(dim=-1)  # [bsz], sum over non-pad tokens
        var_ref_sum = (var_ref * mask).sum(dim=-1)  # [bsz], sum over non-pad tokens
        discrepancy = (log_likelihood_sum - mean_ref_sum) / (var_ref_sum.sqrt() + 1e-8)  # [bsz], avoid division by zero

        return discrepancy, log_likelihood_sum

    def get_discrepancy_of_scoring_and_reference_models(self,
                                                        input_ids_for_scoring_model,
                                                        attention_mask_for_scoring_model,
                                                        input_ids_for_reference_model=None,
                                                        attention_mask_for_reference_model=None,
                                                        ) -> dict:
        """Run both models on one batch and return discrepancy statistics.

        Reference-model logits are computed under ``torch.no_grad`` (the
        reference is never trained here). When no reference model is set,
        the scoring logits themselves serve as the reference distribution
        and the 'reference_*' outputs are None.

        Returns:
            dict with keys 'scoring_discrepancy', 'scoring_logprob',
            'reference_discrepancy', 'reference_logprob' (per-sequence
            tensors, or None for the reference entries without a
            reference model).
        """
        labels = input_ids_for_scoring_model[:, 1:]  # shape: [bsz, sentence_len - 1]
        scoring_logits = self.scoring_model(input_ids_for_scoring_model,
                                            attention_mask=attention_mask_for_scoring_model).logits[:,:-1,:]
        if self.reference_model is not None:
            assert input_ids_for_reference_model is not None and attention_mask_for_reference_model is not None, \
                "If reference_model is provided, you should provide reference_tokenizer to dataset initialization."
            with torch.no_grad():
                # check if tokenizer is the match
                reference_labels = input_ids_for_reference_model[:, 1:]  # shape: [bsz, sentence_len]
                assert torch.all(reference_labels == labels), \
                    "Tokenizer is mismatch."
                reference_logits = self.reference_model(input_ids_for_reference_model,
                                                        attention_mask=attention_mask_for_reference_model).logits[:,:-1,:]
        else:
            reference_logits = scoring_logits

        if self.reference_model is not None:
            # Reference-vs-itself discrepancy: the reference distribution is
            # both the scorer and the expectation baseline here.
            discrepancy_ref, logprob_ref = self.get_sampling_discrepancy_analytic(reference_logits, reference_logits,
                                                                                  labels, attention_mask=attention_mask_for_reference_model)
        else:
            discrepancy_ref, logprob_ref = None, None
        discrepancy_score, logprob_score = self.get_sampling_discrepancy_analytic(reference_logits, scoring_logits,
                                                                                  labels, attention_mask=attention_mask_for_scoring_model)

        return {
            'scoring_discrepancy': discrepancy_score,
            'scoring_logprob': logprob_score,
            'reference_discrepancy': discrepancy_ref,
            'reference_logprob': logprob_ref,
        }

    def forward(self,
                scoring_original_input_ids,
                scoring_original_attention_mask,
                scoring_rewritten_input_ids,
                scoring_rewritten_attention_mask,
                reference_original_input_ids=None,
                reference_original_attention_mask=None,
                reference_rewritten_input_ids=None,
                reference_rewritten_attention_mask=None,
                ) -> dict:
        """Compute discrepancies for an (original, rewritten) text pair.

        Validates that reference inputs are supplied exactly when the
        training method requires them ('SPO': required, 'DDL': forbidden),
        then scores both texts with
        ``get_discrepancy_of_scoring_and_reference_models``.

        Returns:
            dict mapping
            '{scoring,reference}_{original,rewritten}_{discrepancy,logprob}'
            to per-sequence tensors (reference entries are None for DDL).
        """
        if self.train_method == 'SPO':
            assert reference_original_input_ids is not None and reference_original_attention_mask is not None, \
                "If train_method is SPO, you should provide reference_original_input_ids and reference_original_attention_mask."
            assert reference_rewritten_input_ids is not None and reference_rewritten_attention_mask is not None, \
                "If train_method is SPO, you should provide reference_rewritten_input_ids and reference_rewritten_attention_mask."
        elif self.train_method == 'DDL':
            assert reference_original_input_ids is None and reference_original_attention_mask is None, \
                "If train_method is DDL, you should not provide reference_original_input_ids and reference_original_attention_mask."
            assert reference_rewritten_input_ids is None and reference_rewritten_attention_mask is None, \
                "If train_method is DDL, you should not provide reference_rewritten_input_ids and reference_rewritten_attention_mask."
        else:
            raise ValueError('train_method should be DDL or SPO.')
        original_output = self.get_discrepancy_of_scoring_and_reference_models(
            input_ids_for_scoring_model=scoring_original_input_ids,
            attention_mask_for_scoring_model=scoring_original_attention_mask,
            input_ids_for_reference_model=reference_original_input_ids,
            attention_mask_for_reference_model=reference_original_attention_mask,
        )
        rewritten_output = self.get_discrepancy_of_scoring_and_reference_models(
            input_ids_for_scoring_model=scoring_rewritten_input_ids,
            attention_mask_for_scoring_model=scoring_rewritten_attention_mask,
            input_ids_for_reference_model=reference_rewritten_input_ids,
            attention_mask_for_reference_model=reference_rewritten_attention_mask,
        )

        return {
            'scoring_original_discrepancy': original_output['scoring_discrepancy'],
            'scoring_original_logprob': original_output['scoring_logprob'],
            'scoring_rewritten_discrepancy': rewritten_output['scoring_discrepancy'],
            'scoring_rewritten_logprob': rewritten_output['scoring_logprob'],
            'reference_original_discrepancy': original_output['reference_discrepancy'],
            'reference_original_logprob': original_output['reference_logprob'],
            'reference_rewritten_discrepancy': rewritten_output['reference_discrepancy'],
            'reference_rewritten_logprob': rewritten_output['reference_logprob'],
        }
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ peft
2
+ torch
3
+ transformers
4
+ protobuf
5
+ python-docx
6
+ gradio
7
+ numpy
8
+ huggingface_hub
9
+ datasets
10
+ spaces