File size: 4,696 Bytes
4bcf02b
 
 
baabe35
a93c020
40f65bf
baabe35
4bcf02b
addd1a8
4bcf02b
ad12482
 
 
4bcf02b
c41212f
af2cb36
 
 
 
 
4bcf02b
 
 
 
 
 
 
5fd33a9
4bcf02b
addd1a8
baabe35
4bcf02b
5fd33a9
 
af005d6
5fd33a9
602cd0f
69c8fc6
af005d6
 
 
 
ad12482
af005d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e7fa9f
40f65bf
 
 
 
 
 
af005d6
 
40f65bf
4e7fa9f
af005d6
75f1c2d
40f65bf
 
 
4e7fa9f
 
 
 
 
 
addd1a8
4bcf02b
 
 
 
832b0d9
 
bec2c97
 
 
 
 
 
832b0d9
bec2c97
 
a93c020
 
 
 
 
 
4bcf02b
bec2c97
a93c020
 
 
 
 
 
 
 
 
 
 
 
4bcf02b
addd1a8
a93c020
 
 
 
4bcf02b
 
 
 
 
 
bec2c97
4bcf02b
bec2c97
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import gradio as gr
import torch
import transformers
import spaces
from synthid_text import  synthid_mixin,logits_processing
from synthid_text.detector_mean import mean_score

# Model choice, target device and sampling settings shared by the app.
MODEL_NAME = "google/gemma-7b-it"
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
TOP_K = 40
TOP_P = 0.99
TEMPERATURE = 0.5

# Only the tokenizer is loaded. The watermarked model load is left disabled
# here; nothing in this file calls the model, detection only reads token ids.
# To re-enable generation:
# model = synthid_mixin.SynthIDGemmaForCausalLM.from_pretrained(
#     MODEL_NAME,
#     device_map=DEVICE,
#     torch_dtype=torch.bfloat16,
# )
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.padding_side = "left"
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Watermarking configuration (library default)
CONFIG = synthid_mixin.DEFAULT_WATERMARKING_CONFIG



# Function to check for AI-generated content using SynthID and highlight watermark
@spaces.GPU
def check_plagiarism(text):
    """Score ``text`` for a SynthID watermark and return an HTML report.

    The text is tokenized, per-n-gram g-values are computed with the SynthID
    logits processor, and the masked mean of those values is used as the
    overall score. Tokens whose own mean g-value exceeds 0.55 (and that are
    not masked out) are wrapped in ``<mark>`` tags.

    Args:
        text: The user-supplied text to analyse.

    Returns:
        An HTML string: a verdict sentence followed by the (HTML-escaped)
        input text with suspected watermarked tokens highlighted. If the
        text has fewer tokens than one n-gram, a short explanatory message
        is returned instead of a verdict.
    """
    import html  # local import: only needed to escape user text for gr.HTML

    ngram_len = CONFIG['ngram_len']
    # g-values and masks describe n-grams, so their first entry belongs to
    # the token at index ``ngram_len - 1`` (same offset as the EOS-mask slice
    # below).
    offset = ngram_len - 1

    # Logits processor for SynthID (uses the module-level sampling settings)
    logits_processor = logits_processing.SynthIDLogitsProcessor(
        **CONFIG, top_k=TOP_K, temperature=TEMPERATURE
    )

    # Tokenize the input text and keep only the token ids
    inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
    input_ids = inputs['input_ids']

    # With fewer tokens than one n-gram there is nothing to score; bail out
    # instead of failing inside the n-gram computation.
    if input_ids.shape[1] < ngram_len:
        return (
            f"Text is too short to analyze (at least {ngram_len} tokens are "
            f"required). {html.escape(text)}"
        )

    # Compute masks for watermark detection
    eos_token_mask = logits_processor.compute_eos_token_mask(
        input_ids=input_ids,
        eos_token_id=tokenizer.eos_token_id,
    )[:, offset:]
    context_repetition_mask = logits_processor.compute_context_repetition_mask(
        input_ids=input_ids
    )
    combined_mask = context_repetition_mask * eos_token_mask

    # Compute G values for the input text and score them under the mask
    g_values = logits_processor.compute_g_values(input_ids=input_ids)
    score = mean_score(g_values.cpu().numpy(), combined_mask.cpu().numpy())

    def render(token_id):
        # Special tokens (e.g. BOS) are dropped: once escaped they would
        # otherwise show up as literal text in the report. Everything else is
        # escaped because the result is rendered as raw HTML.
        piece = tokenizer.decode(token_id.unsqueeze(0), skip_special_tokens=True)
        return html.escape(piece)

    token_ids = input_ids[0]

    # The first ``offset`` tokens have no g-value of their own; emit them
    # unhighlighted so the output still contains the complete text.
    pieces = [render(token_id) for token_id in token_ids[:offset]]

    # Remaining tokens line up one-to-one with g-values and mask entries.
    for token_id, g_val, mask in zip(token_ids[offset:], g_values[0], combined_mask[0]):
        piece = render(token_id)
        if mask.item() and g_val.float().mean().item() > 0.55:
            pieces.append(f"<mark>{piece}</mark>")  # Highlight watermarked content
        else:
            pieces.append(piece)

    highlighted_text = "".join(pieces)

    # Return the highlighted text together with the overall verdict
    if score > 0.5:
        return f"Flagged as AI-generated content (Academic Integrity Warning): {highlighted_text}"
    else:
        return f"Content appears to be human-generated. {highlighted_text}"


# Define the Gradio interface
def create_plagiarism_checker():
    """Build the Gradio UI for the watermark checker.

    The layout is: a style block, a title/description, a multi-line input
    textbox, a divider, an HTML output area and a button that runs
    ``check_plagiarism`` on the textbox content and writes the result to the
    output area.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks() as app:
        # Add custom CSS styling using gr.HTML
        gr.HTML("""
            <style>
                #text_input { font-size: 16px; border: 1px solid #ddd; padding: 8px; }
                #output { font-size: 16px; padding: 8px; border-radius: 5px; }
                #check_button { font-size: 16px; background-color: #4CAF50; color: white; border: none; padding: 10px 20px; cursor: pointer; }
                #check_button:hover { background-color: #45a049; }
            </style>
        """)

        # Title and description.
        # The blank line before '---' matters: without it Markdown reads the
        # dashes as a setext underline and turns the paragraph into a heading
        # instead of drawing a horizontal rule.
        gr.Markdown("""
            # 📝 Plagiarism and Academic Integrity Checker
            Use this tool to detect AI-generated content in your text using SynthID technology.
            Paste your text below to check if it contains AI-generated segments.

            ---
        """)

        # Layout the components
        with gr.Row():
            # Input textbox for users to paste text
            text_input = gr.Textbox(
                placeholder="Paste your text here...",
                label="Input Text",
                lines=10,
                max_lines=20,
                elem_id="text_input",
            )

        # Divider for clarity
        gr.Markdown("---")

        # Output box to display the result with highlighted watermark
        output = gr.HTML(label="Integrity Check Result", elem_id="output")

        # Button to initiate the check, styled with a color accent
        check_button = gr.Button("🔍 Check Text", elem_id="check_button")

        # Define the click event for the button
        check_button.click(fn=check_plagiarism, inputs=text_input, outputs=output)

    return app

# Build the app at module level so the object stays importable, but only
# start the server when this file is executed as a script.
plagiarism_checker_app = create_plagiarism_checker()

if __name__ == "__main__":
    plagiarism_checker_app.launch()