# Hugging Face Space app file (Space status at capture: Sleeping; file size: 4,696 bytes).
import gradio as gr
import torch
import transformers
import spaces
from synthid_text import synthid_mixin,logits_processing
from synthid_text.detector_mean import mean_score
# --- Model selection and runtime configuration ---
MODEL_NAME = "google/gemma-7b-it"
# Use the first CUDA device when one is visible, otherwise fall back to CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Sampling hyperparameters (presumably intended for the SynthID logits
# processor -- TODO confirm which of these are actually consumed).
TOP_K = 40
TOP_P = 0.99
TEMPERATURE = 0.5

# --- Tokenizer setup ---
# The full causal-LM load is left disabled; only the tokenizer is created here.
# model = synthid_mixin.SynthIDGemmaForCausalLM.from_pretrained(
#     MODEL_NAME,
#     device_map=DEVICE,
#     torch_dtype=torch.bfloat16,
# )
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# --- Watermarking configuration ---
# Library-provided default SynthID settings.
CONFIG = synthid_mixin.DEFAULT_WATERMARKING_CONFIG
# Function to check for AI-generated content using SynthID and highlight watermark
@spaces.GPU
def check_plagiarism(text):
    """Score ``text`` for a SynthID watermark and return an HTML report.

    The text is tokenized, per-position g-values and validity masks are
    computed with the SynthID logits processor, and a mean score over the
    masked g-values decides the verdict.

    Args:
        text: Raw user-supplied text from the Gradio textbox.

    Returns:
        An HTML string: a verdict sentence followed by the (HTML-escaped)
        input text, with tokens whose mean g-value exceeds the highlight
        threshold wrapped in ``<mark>``. Blank input, or input with fewer
        tokens than one n-gram, yields a short plain message instead.
    """
    # Function-scope import so the block does not depend on file-level edits.
    import html

    highlight_threshold = 0.55  # per-token mean g-value needed for <mark>
    flag_threshold = 0.5  # overall mean score needed to flag the text

    if not text or not text.strip():
        return "Please enter some text to check."

    # Logits processor for SynthID (uses the module-level sampling settings).
    logits_processor = logits_processing.SynthIDLogitsProcessor(
        **CONFIG, top_k=TOP_K, temperature=TEMPERATURE
    )

    # Tokenize the input text and keep only its token IDs.
    inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
    input_ids = inputs['input_ids']

    # Scores exist only from the first complete n-gram onwards, i.e. the
    # first ``ngram_len - 1`` tokens have no g-value / mask entry.
    ngram_offset = CONFIG['ngram_len'] - 1
    if input_ids.shape[1] <= ngram_offset:
        return "Text is too short to analyze."

    # Compute masks for watermark detection; the EOS mask is per token, so it
    # is trimmed to line up with the per-n-gram context-repetition mask.
    eos_token_mask = logits_processor.compute_eos_token_mask(
        input_ids=input_ids,
        eos_token_id=tokenizer.eos_token_id,
    )[:, ngram_offset:]
    context_repetition_mask = logits_processor.compute_context_repetition_mask(
        input_ids=input_ids
    )
    combined_mask = context_repetition_mask * eos_token_mask

    # Compute G values for the input text and score them under the mask.
    g_values = logits_processor.compute_g_values(input_ids=input_ids)
    score = mean_score(g_values.cpu().numpy(), combined_mask.cpu().numpy())

    def render(token_id):
        # Escape the decoded piece so user text cannot inject markup into the
        # HTML output; special tokens (e.g. BOS) are skipped so that escaping
        # does not surface them as literal text.
        piece = tokenizer.decode(token_id.unsqueeze(0), skip_special_tokens=True)
        return html.escape(piece)

    token_ids = input_ids[0]
    # Leading tokens without a score are emitted unhighlighted so that no part
    # of the input is dropped from the report.
    pieces = [render(token_id) for token_id in token_ids[:ngram_offset]]
    # Entry j of the scores belongs to token j + ngram_offset.
    for token_id, g_val, mask in zip(
        token_ids[ngram_offset:], g_values[0], combined_mask[0]
    ):
        piece = render(token_id)
        if mask.item() and g_val.float().mean().item() > highlight_threshold:
            piece = f"<mark>{piece}</mark>"  # Highlight watermarked content
        pieces.append(piece)
    highlighted_text = "".join(pieces)

    # Return the verdict together with the highlighted text.
    if score > flag_threshold:
        return f"Flagged as AI-generated content (Academic Integrity Warning): {highlighted_text}"
    return f"Content appears to be human-generated. {highlighted_text}"
# Define the Gradio interface
def create_plagiarism_checker():
    """Assemble the Gradio Blocks UI for the checker and return it.

    The layout is, top to bottom: a style block, a title/description, the
    input textbox, a divider, the HTML result area, and a button that runs
    ``check_plagiarism`` on the textbox contents and writes to the result area.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    # Static content is defined up front so the layout code below stays short.
    # NOTE(review): the leading '๐' in the title and in the button label looks
    # like a mis-encoded emoji; kept as-is -- confirm the intended glyph.
    style_block = """
<style>
#text_input { font-size: 16px; border: 1px solid #ddd; padding: 8px; }
#output { font-size: 16px; padding: 8px; border-radius: 5px; }
#check_button { font-size: 16px; background-color: #4CAF50; color: white; border: none; padding: 10px 20px; cursor: pointer; }
#check_button:hover { background-color: #45a049; }
</style>
"""
    intro_markdown = """
# ๐ Plagiarism and Academic Integrity Checker
Use this tool to detect AI-generated content in your text using SynthID technology.
Paste your text below to check if it contains AI-generated segments.
---
"""

    with gr.Blocks() as demo:
        # Custom CSS is injected through an HTML component.
        gr.HTML(style_block)
        gr.Markdown(intro_markdown)

        # NOTE(review): only the textbox is placed inside the Row; confirm
        # against the intended layout.
        with gr.Row():
            input_box = gr.Textbox(
                label="Input Text",
                placeholder="Paste your text here...",
                lines=10,
                max_lines=20,
                elem_id="text_input",
            )

        gr.Markdown("---")

        # Result area; rendered as HTML so <mark> highlights show up.
        result_html = gr.HTML(label="Integrity Check Result", elem_id="output")

        submit_btn = gr.Button("๐ Check Text", elem_id="check_button")
        submit_btn.click(fn=check_plagiarism, inputs=input_box, outputs=result_html)

    return demo
# Build the UI and start the Gradio server.
plagiarism_checker_app = create_plagiarism_checker()
plagiarism_checker_app.launch()